In [0]:
%%sh
# aws s3 ls s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=MONTH/date=2020-03-31/

# List the entries directly under the unified monthly app-est (v3) fact prefix.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v3/fact/granularity=monthly/

# Same prefix listed recursively with --summarize; only the last 5 lines of the output are kept.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v3/fact/granularity=monthly/ --recursive --summarize --human | tail -5


In [0]:

import datetime
from pyspark.sql import types as T
from pyspark.sql import functions as F

from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.pipeline import type_
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from applications.auto_pipeline.transform import _view
from aadatapipelinecore.core.utils.spark import eject_all_caches
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import aaplproxy

"""
get date:  [ [month, [days]], [month, [days]], [month, [days]], ....... ]
"""
def get_date_list(start_date, end_date, freq="D"):
    """Return the dates generated by ``pandas.date_range`` as 'YYYY-MM-DD' strings.

    Args:
        start_date: first bound handed to ``pandas.date_range`` (e.g. '2019-08-01').
        end_date: last bound handed to ``pandas.date_range``.
        freq: pandas frequency alias, for example
            D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency

    Returns:
        list of str: one 'YYYY-MM-DD' entry per generated timestamp.
    """
    # Local import kept from the original so the helper does not rely on a
    # module-level pandas import in the notebook.
    import pandas as pd

    timestamps = pd.date_range(start=start_date, end=end_date, freq=freq)
    return [stamp.strftime('%Y-%m-%d') for stamp in timestamps]

# start = "2010-07-31"  # prod
start = "2019-08-31"    # test
end = "2019-08-31"

# Month-end dates between start and end.
monthly = get_date_list(start, end, freq='M')
print(monthly)


# date_list holds one [month_end, [days of that month]] pair per month:
# the day range runs from the first day of the month up to the month-end date.
date_list = []
for month_end in monthly:
    first_day_of_month = month_end[:8] + '01'
    date_list.append([month_end, get_date_list(first_day_of_month, month_end, freq='D')])

print(date_list)



class DryRunSqlExecutor(SqlExecutor):
    """SqlExecutor subclass whose ``_verify_tasks`` does nothing.

    Selected by ``run`` when ``dry_run`` is true.
    """

    def _verify_tasks(self):
        """No-op; presumably replaces the parent's task verification (confirm in SqlExecutor)."""
        return None


def run(spark, raw_data, sql_text, dry_run=True):
    """Parse ``sql_text`` into tasks and run them as a TRANSFORM event.

    Args:
        spark: Spark session handed to ``_view``, ``SqlParser`` and the executor.
        raw_data: job message dict; must contain the keys "namespace", "source"
            and "options". The dict is left untouched: a shallow copy is used
            to build the execution context.
        sql_text: SQL script passed to ``_view`` and ``SqlParser``.
        dry_run: use ``DryRunSqlExecutor`` when true, ``SqlExecutor`` otherwise.

    Raises:
        KeyError: if "namespace", "source" or "options" is missing.
    """
    # Copy first: popping from the caller's dict made a message single-use
    # (a second call with the same dict failed with KeyError on "source").
    context = dict(raw_data)
    urn = Urn(namespace=context["namespace"])
    source_data_list = context.pop("source")
    # The entries of "options" are merged into the top level of the context.
    context.update(context.pop("options"))
    _view(spark, sql_text, None, source_data_list)
    tasks = SqlParser(spark, sql_text, context).parse()
    executor_cls = DryRunSqlExecutor if dry_run else SqlExecutor
    executor_cls(urn, spark, tasks, type_.EventType.TRANSFORM, context).run()

# CSV schema
from pyspark.sql import types as T
from pyspark.sql import functions as F

# Schema applied when reading the tab-separated CSV estimate files;
# the fields are listed in file column order and every field is nullable.
csv_schema = (
    T.StructType()
    .add("store_id", T.IntegerType(), True)
    .add("date", T.DateType(), True)
    .add("platform_id", T.IntegerType(), True)
    .add("vertical", T.IntegerType(), True)
    .add("feed", T.IntegerType(), True)
    .add("id", T.LongType(), True)
    .add("est", T.IntegerType(), True)
    .add("category_id", T.IntegerType(), True)
    .add("rank", T.IntegerType(), True)
)

def test_monthkly_data(test_data):
    """Load the raw daily estimates belonging to one month.

    Args:
        test_data: ``[month_end, days]`` pair, i.e. the month-end date string
            and the list of day strings of that month.

    The month decides the source: iOS CSV files for '2010-07-31' (days
    2010-07-04..31) and for months up to 2019-06, CSV plus Parquet for
    '2019-07-31', Parquet only otherwise. Only the Parquet-only branch
    displays rows; the comparison with the unified table is still commented
    out. All caches are ejected before returning.
    """
    print(test_data[0])
    print(test_data[1])

    csv_root = "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/"
    parquet_root = "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/"
    wanted_columns = ('id', 'store_id', 'category_id', 'platform_id', 'vertical', 'rank', 'feed', 'est', 'date', 'platform')

    def read_ios_csv(days):
        # One brace glob covering every requested day of the iOS CSV files.
        location = csv_root + "{%s}/ios/sbe_est_app/*/" % ",".join(days)
        frame = spark.read.option("basePath", csv_root).schema(csv_schema).csv(location, sep="\t")
        return frame.withColumn("platform", F.lit("ios")).select(*wanted_columns).cache()

    def read_parquet(days):
        # Parquet layout: date=<day>/platform=<platform>/...
        location = parquet_root + "date={%s}/platform=*/*/" % ",".join(days)
        return spark.read.option("basePath", parquet_root).parquet(location)

    month_indicator = test_data[0]
    if month_indicator == '2010-07-31':
        # only csv, but date range is '2010-07-04' to '2010-07-31'
        df_1 = read_ios_csv(get_date_list('2010-07-04', '2010-07-31', freq='D'))

    elif '2010-08-01' < month_indicator < '2019-07-01':
        # only csv
        df_1 = read_ios_csv(test_data[1])

    elif month_indicator == '2019-07-31':
        # half is csv (07-01..07-14), half is parquet (07-15..07-31)
        first_half_month_df = read_ios_csv(get_date_list('2019-07-01', '2019-07-14'))
        second_half_month = read_parquet(get_date_list('2019-07-15', '2019-07-31')).select(*wanted_columns).cache()
        df_1 = first_half_month_df.union(second_half_month)

    else:  # month_indicator >= '2019-08-31'
        # only parquet
        df_1 = read_parquet(test_data[1]).cache()
        df_1.show(10)

    # df_1.createOrReplaceTempView("daily_data")

    # weekly_df_ho = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v3/fact/").where("granularity='monthly' and date='{}' and data_stage='final'".format(test_data[0])).cache()
    # weekly_df_ho.createOrReplaceTempView("unified_monthly")

    eject_all_caches(spark)


# Run the check for every [month_end, days] pair, sequentially on the driver.
# The former sc.parallelize(map(...), 1) only wrapped the already-computed
# results in an RDD that was never used; the work itself ran inside map().
for month_and_days in date_list:
    test_monthkly_data(month_and_days)

# diff_df1 = spark.sql("select * from country_category_mapping_raw except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from unified_monthly ")
# diff_df2 = spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from unified_monthly  except all select * from country_category_mapping_raw")
# if diff_df1.take(1) or diff_df2.take(1):
#     print "FAILED: ", sar_list[0]
#     diff_df1.show(100)
#     diff_df2.show(100)
print("END")

In [0]:

import datetime
from pyspark.sql import types as T
from pyspark.sql import functions as F

from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.pipeline import type_
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from applications.auto_pipeline.transform import _view
from aadatapipelinecore.core.utils.spark import eject_all_caches
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import aaplproxy

# Days to validate: every calendar day from `start` (inclusive) to `end` (exclusive).
start = "2017-04-01"
end = "2020-03-01"
# end = "2012-05-01"
real_date1 = datetime.datetime.strptime(start, "%Y-%m-%d").date()
real_date2 = datetime.datetime.strptime(end, "%Y-%m-%d").date()
date_range = real_date2 - real_date1
# range() instead of the Python-2-only xrange(), so the cell runs on either interpreter.
dates = [str(real_date1 + datetime.timedelta(days=offset)) for offset in range(date_range.days)]



class DryRunSqlExecutor(SqlExecutor):
    """SqlExecutor subclass whose ``_verify_tasks`` does nothing.

    Selected by ``run`` when ``dry_run`` is true.
    """

    def _verify_tasks(self):
        """No-op; presumably replaces the parent's task verification (confirm in SqlExecutor)."""
        return None


def run(spark, raw_data, sql_text, dry_run=True):
    """Parse ``sql_text`` into tasks and run them as a TRANSFORM event.

    Args:
        spark: Spark session handed to ``_view``, ``SqlParser`` and the executor.
        raw_data: job message dict; must contain the keys "namespace", "source"
            and "options". The dict is left untouched: a shallow copy is used
            to build the execution context.
        sql_text: SQL script passed to ``_view`` and ``SqlParser``.
        dry_run: use ``DryRunSqlExecutor`` when true, ``SqlExecutor`` otherwise.

    Raises:
        KeyError: if "namespace", "source" or "options" is missing.
    """
    # Copy first: popping from the caller's dict made a message single-use
    # (a second call with the same dict failed with KeyError on "source").
    context = dict(raw_data)
    urn = Urn(namespace=context["namespace"])
    source_data_list = context.pop("source")
    # The entries of "options" are merged into the top level of the context.
    context.update(context.pop("options"))
    _view(spark, sql_text, None, source_data_list)
    tasks = SqlParser(spark, sql_text, context).parse()
    executor_cls = DryRunSqlExecutor if dry_run else SqlExecutor
    executor_cls(urn, spark, tasks, type_.EventType.TRANSFORM, context).run()

# CSV schema
from pyspark.sql import types as T
from pyspark.sql import functions as F

# Schema applied when reading the tab-separated CSV estimate files;
# the fields are listed in file column order and every field is nullable.
csv_schema = (
    T.StructType()
    .add("store_id", T.IntegerType(), True)
    .add("date", T.DateType(), True)
    .add("platform_id", T.IntegerType(), True)
    .add("vertical", T.IntegerType(), True)
    .add("feed", T.IntegerType(), True)
    .add("id", T.LongType(), True)
    .add("est", T.IntegerType(), True)
    .add("category_id", T.IntegerType(), True)
    .add("rank", T.IntegerType(), True)
)

def test_monthkly_data(test_data):
    """Compare one day of raw store estimates with the unified daily est-load table.

    Args:
        test_data: the day to check, as a 'YYYY-MM-DD' string.

    Steps:
        1. Load the raw estimates for the day (CSV or Parquet depending on the
           date) and register them as the temp view ``daily_data``.
        2. Load the unified ``store.app-est-load.v3`` delta rows for the day and
           register them as ``daily_unified_est``.
        3. Run the SQL script through ``run`` (dry run by default). The diff
           queries below expect it to expose ``country_category_mapping_raw``.
        4. Show the ``except all`` difference in both directions; two empty
           results mean both sides match.
        5. Eject all Spark caches.
    """
    print(test_data)
    month_indicator = test_data

    csv_base = "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/"
    parquet_base = "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/"
    raw_columns = ('id', 'store_id', 'category_id', 'platform_id', 'vertical', 'rank', 'feed', 'est', 'date', 'platform')

    def read_csv(day_glob, platform):
        """Read the tab-separated raw files of one platform for a comma-separated day list."""
        path = csv_base + "{%s}/%s/sbe_est_app/*/" % (day_glob, platform)
        return (
            spark.read.option("basePath", csv_base)
            .schema(csv_schema)
            .csv(path, sep="\t")
            .withColumn("platform", F.lit(platform))
            .select(*raw_columns)
            .cache()
        )

    ### 1. only csv, but date range is '2010-07-04' to '2010-07-31' ###
    if month_indicator == '2010-07-31':
        # get_date_list comes from an earlier cell of this notebook.
        temp_date_range = get_date_list('2010-07-04', '2010-07-31', freq='D')
        # Fix: the list used to be formatted directly into the path, producing
        # "{['2010-07-04', ...]}" instead of a valid "{2010-07-04,...}" glob.
        # NOTE(review): this branch loads a whole date range although the
        # comparison below is for a single day - confirm that is intended.
        df_1 = read_csv(",".join(temp_date_range), "ios")

    ### 2. only csv
    # NOTE(review): days before 2010-07-31 and 2010-08-01 itself fall through
    # to the parquet branch; harmless for the current date range, but confirm.
    elif month_indicator > '2010-08-01' and month_indicator < '2019-07-14':
        df_ios = read_csv(test_data, "ios")
        df_android = read_csv(test_data, "android")
        df_1 = df_ios.union(df_android)

    # (The mixed csv/parquet handling for the month '2019-07-31' was disabled
    # in this daily variant.)

    ### 4. only parquet ###
    else:  # month_indicator >= '2019-08-31'
        df_1 = spark.read.option("basePath", parquet_base).parquet(parquet_base + "date={%s}/platform=*/*/" % test_data).cache()

    df_1.createOrReplaceTempView("daily_data")

    daily_est_load = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-load.v3/fact/").where("granularity='daily' and date='{}'".format(test_data)).cache()
    daily_est_load.createOrReplaceTempView("daily_unified_est")

    sql_text = """
    
    WITH filter_top_N_raw_data AS(
    SELECT
     distinct
      id,
      Sum(est) AS est,
      store_id,
      platform_id,
      feed,
      vertical,
      platform
    FROM
      (
        SELECT
          DISTINCT d1.id,
          d1.est,
          d1.store_id,
          d1.date,
          d1.feed,
          d1.vertical,
          d1.platform_id,
          d1.platform
        FROM
          daily_data AS d1
          JOIN daily_data AS d2 
          ON d1.id = d2.id
          AND d1.store_id = d2.store_id
          AND d1.feed = d2.feed
          AND d1.vertical = d2.vertical
          AND d1.platform_id = d2.platform_id
        WHERE (d1.rank <= 4000 and d2.rank<=4000 and d1.store_id == 0 and d1.platform = 'ios' ) 
            OR (d1.rank <= 1000 and d2.rank<=1000 and  d1.store_id != 0 and d1.platform = 'ios' )
            OR  (d1.rank <= 4000 and d2.rank<=4000 and d1.store_id == 1000 and d1.platform = 'android' ) 
            OR (d1.rank <= 1000 and d2.rank<=1000 and  d1.store_id != 1000 and d1.platform = 'android' )
      ) AS t
    WHERE
      feed IN (
        0,
        1,
        2,
        101,
        100,
        102
      )
    GROUP BY
      id,
      store_id,
      platform_id,
      vertical,
      feed,
      platform);
      
     WITH replace_metric AS (
     SELECT * ,
         case 
        when feed='0' and platform='ios' then 'free_app_download'
        when feed='1' and platform='ios' then 'paid_app_download'
        when feed='2' and platform='ios' then 'revenue' 
        when feed='101' and platform='ios' then 'free_app_download' 
        when feed='100' and platform='ios' then 'paid_app_download' 
        when feed='102' and platform='ios' then 'revenue' 
        when feed='0' and platform='android' then 'free_app_download' 
        when feed='1' and platform='android' then 'paid_app_download' 
        when feed='2' and platform='android' then 'revenue' 
        end as metric from filter_top_N_raw_data);
      
      
         WITH replace_metric_device_code AS (
        SELECT * ,
         case 
        when feed='0' and platform='ios' then 'ios-phone'
        when feed='1' and platform='ios' then 'ios-phone'
        when feed='2' and platform='ios' then 'ios-phone' 
        when feed='101' and platform='ios' then 'ios-tablet' 
        when feed='100' and platform='ios' then 'ios-tablet' 
        when feed='102' and platform='ios' then 'ios-tablet' 
        when feed='0' and platform='android' then 'android-all' 
        when feed='1' and platform='android' then 'android-all' 
        when feed='2' and platform='android' then 'android-all' 
        end as device_code from replace_metric);


    WITH group_by_metric_1 AS (
        SELECT max(est) as est, id, metric,device_code, store_id, platform from replace_metric_device_code where store_id not in (3,4,5,6) and device_code in ('ios-phone' ,'ios-tablet' ) and feed not in (1000, 1001, 1002) group by id, store_id, metric,device_code, platform
        );
        
    WITH group_by_metric_2 AS (
        SELECT max(est) as est, id, metric,device_code, store_id, platform from replace_metric_device_code where store_id not in ( 1003, 1005, 1006,1007) and device_code='android-all' and feed not in (1000, 1001, 1002) group by id, store_id, metric,device_code, platform
    );
    
    WITH group_by_metric AS(
        SELECT * FROM group_by_metric_1
        UNION ALL
        SELECT * FROM group_by_metric_2
        );

      -- pivot metric column
    WITH pivot_metric_raw AS (

    SELECT 
        distinct id as app_id, store_id, platform, device_code, free_app_download,revenue, paid_app_download
    FROM
          group_by_metric
     PIVOT (
        max(est) 
    	FOR metric IN ('free_app_download','revenue', 'paid_app_download')
      )
    );
    
    
    -- union all platform with country_code mapping

    WITH country_code_mapping AS (
    select *, 'android' as market_code from android_country_mapping 
    UNION ALL select *, 'ios' market_code from ios_country_mapping
    UNION ALL select 143502, 'VE', 'VESA', 'ios'
    UNION ALL select 0, 'WW', 'worldwide', 'ios'
    UNION ALL select 36, 'CZ', 'CZ', 'android'
    UNION ALL select 5, 'ES', 'ES', 'android'

    );



    -- map raw with country_code

    WITH country_category_mapping_raw AS (
    select app_id, country_code, device_code, free_app_download, paid_app_download, revenue 
     from country_code_mapping 
     inner join 
         pivot_metric_raw 
     on 
         country_code_mapping.store_id=pivot_metric_raw.store_id 
     and 
         country_code_mapping.market_code=pivot_metric_raw.platform
    where country_name!='Global'
    );


      """

    def country_mapping_source(name, path):
        """Describe one gzip, tab-separated country-mapping CSV source for ``run``."""
        return {
            "data_encoding": "csv",
            "compression": "gzip",
            "name": name,
            "data_schema": [
                {"name": "store_id", "type": "int", "nullable": False},
                {"name": "country_code", "type": "string", "nullable": False},
                {"name": "country_name", "type": "string", "nullable": False},
            ],
            "csv_options": {
                'header': True,
                'sep': '\t',
                'quote': '',
                'encoding': 'utf-8',
                'escape': '',
            },
            "path": [path],
        }

    # store_unified , rank_unified
    ingest_msg = {
        "namespace": "aa.store.market-size.v1",
        "job_type": "routine",
        "options": {},
        "source": [
            country_mapping_source("ios_country_mapping", "s3://b2c-prod-dca-store-estimates/store_back/dimension/IOS_COUNTRY_MAPPING"),
            country_mapping_source("android_country_mapping", "s3://b2c-prod-dca-store-estimates/store_back/dimension/ANDROID_COUNTRY_MAPPING"),
        ],
    }

    run(spark, ingest_msg, sql_text)

    # Rows present on one side only, in both directions.
    diff_df1 = spark.sql("select * from country_category_mapping_raw except all select app_id, country_code, device_code, est_free_app_download as free_app_download, est_paid_app_download as paid_app_download, est_revenue as revenue from daily_unified_est ")
    diff_df2 = spark.sql("select app_id, country_code, device_code, est_free_app_download as free_app_download, est_paid_app_download as paid_app_download, est_revenue as revenue from daily_unified_est  except all select * from country_category_mapping_raw")

    diff_df1.show()
    diff_df2.show()

    eject_all_caches(spark)


# Run the check day by day on the driver. The former
# sc.parallelize(map(...), 1) only wrapped the already-computed results in an
# RDD that was never used; the work itself ran inside map().
for day in dates:
    test_monthkly_data(day)
    


In [0]:
%%sh

# Raw daily estimates for 2020-03-31, listed recursively and piped through sort -n.
aws s3 ls s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/date=2020-03-31/ --recursive | sort -n
# Visual separator between the two listings.
echo '*****'
# Unified daily app-est (v1) fact objects for the same date, listed the same way.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date=2020-03-31/ --recursive  | sort -n


In [0]:

import datetime
from pyspark.sql import types as T
from pyspark.sql import functions as F

from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.pipeline import type_
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from applications.auto_pipeline.transform import _view
from aadatapipelinecore.core.utils.spark import eject_all_caches
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import aaplproxy

# Days to validate: every calendar day from `start` (inclusive) to `end` (exclusive).
start = "2020-05-02"
end = "2020-05-03"
# end = "2012-05-01"
real_date1 = datetime.datetime.strptime(start, "%Y-%m-%d").date()
real_date2 = datetime.datetime.strptime(end, "%Y-%m-%d").date()
date_range = real_date2 - real_date1
# range() instead of the Python-2-only xrange(), so the cell runs on either interpreter.
dates = [str(real_date1 + datetime.timedelta(days=offset)) for offset in range(date_range.days)]



class DryRunSqlExecutor(SqlExecutor):
    """SqlExecutor subclass whose ``_verify_tasks`` does nothing.

    Selected by ``run`` when ``dry_run`` is true.
    """

    def _verify_tasks(self):
        """No-op; presumably replaces the parent's task verification (confirm in SqlExecutor)."""
        return None


def run(spark, raw_data, sql_text, dry_run=True):
    """Parse ``sql_text`` into tasks and run them as a TRANSFORM event.

    Args:
        spark: Spark session handed to ``_view``, ``SqlParser`` and the executor.
        raw_data: job message dict; must contain the keys "namespace", "source"
            and "options". The dict is left untouched: a shallow copy is used
            to build the execution context.
        sql_text: SQL script passed to ``_view`` and ``SqlParser``.
        dry_run: use ``DryRunSqlExecutor`` when true, ``SqlExecutor`` otherwise.

    Raises:
        KeyError: if "namespace", "source" or "options" is missing.
    """
    # Copy first: popping from the caller's dict made a message single-use
    # (a second call with the same dict failed with KeyError on "source").
    context = dict(raw_data)
    urn = Urn(namespace=context["namespace"])
    source_data_list = context.pop("source")
    # The entries of "options" are merged into the top level of the context.
    context.update(context.pop("options"))
    _view(spark, sql_text, None, source_data_list)
    tasks = SqlParser(spark, sql_text, context).parse()
    executor_cls = DryRunSqlExecutor if dry_run else SqlExecutor
    executor_cls(urn, spark, tasks, type_.EventType.TRANSFORM, context).run()

# CSV schema
from pyspark.sql import types as T
from pyspark.sql import functions as F

# Schema applied when reading the tab-separated CSV estimate files;
# the fields are listed in file column order and every field is nullable.
csv_schema = (
    T.StructType()
    .add("store_id", T.IntegerType(), True)
    .add("date", T.DateType(), True)
    .add("platform_id", T.IntegerType(), True)
    .add("vertical", T.IntegerType(), True)
    .add("feed", T.IntegerType(), True)
    .add("id", T.LongType(), True)
    .add("est", T.IntegerType(), True)
    .add("category_id", T.IntegerType(), True)
    .add("rank", T.IntegerType(), True)
)

def test_monthkly_data(test_data):
    """Recompute one day of download-attribution estimates from the raw data.

    Args:
        test_data: the day to check, as a 'YYYY-MM-DD' string.

    Steps:
        1. Load the raw estimates for the day (CSV or Parquet depending on the
           date) and register them as the temp view ``daily_data``.
        2. Register three unified delta tables for the day as temp views:
           ``daily_unified_est`` (store.app-est-load.v3),
           ``daily_unified_attr_est`` (store.download-attribution.v3) and
           ``daily_pre_load_data`` (store.download-attribution-load.v3).
        3. Run the SQL script through ``run`` (dry run by default). The cell
           code after this function queries ``caculate_data`` and
           ``daily_pre_load_data_coalesce``, so caches are deliberately not
           ejected here.
    """
    print(test_data)
    month_indicator = test_data

    csv_base = "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/"
    parquet_base = "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/"
    unified_base = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/"
    raw_columns = ('id', 'store_id', 'category_id', 'platform_id', 'vertical', 'rank', 'feed', 'est', 'date', 'platform')

    def read_ios_csv(day_glob):
        """Read the tab-separated iOS raw files for a comma-separated day list."""
        path = csv_base + "{%s}/ios/sbe_est_app/*/" % day_glob
        return (
            spark.read.option("basePath", csv_base)
            .schema(csv_schema)
            .csv(path, sep="\t")
            .withColumn("platform", F.lit("ios"))
            .select(*raw_columns)
            .cache()
        )

    def read_parquet(day_glob):
        """Read the Parquet raw files (all platforms) for a comma-separated day list."""
        return spark.read.option("basePath", parquet_base).parquet(parquet_base + "date={%s}/platform=*/*/" % day_glob)

    def register_daily_view(table, view_name):
        """Cache the daily rows of one unified delta table and expose them as a temp view."""
        frame = spark.read.format("delta").load(unified_base + table + "/fact/").where("granularity='daily' and date='{}'".format(test_data)).cache()
        frame.createOrReplaceTempView(view_name)

    ### 1. only csv, but date range is '2010-07-04' to '2010-07-31' ###
    if month_indicator == '2010-07-31':
        # get_date_list comes from an earlier cell of this notebook.
        temp_date_range = get_date_list('2010-07-04', '2010-07-31', freq='D')
        df_1 = read_ios_csv(",".join(temp_date_range))

    ### 2. only csv
    elif month_indicator > '2010-08-01' and month_indicator < '2019-07-01':
        # Fix: test_data is a single date string in this cell, so the former
        # ",".join(test_data[1]) put only its second character into the path.
        df_1 = read_ios_csv(test_data)

    ### 3. half is csv, half is parquet ###
    elif month_indicator == '2019-07-31':
        # First half of 2019-07
        temp_date_range = get_date_list('2019-07-01', '2019-07-14')
        first_half_month_df = read_ios_csv(",".join(temp_date_range))
        # Second half of 2019-07
        temp_date_range = get_date_list('2019-07-15', '2019-07-31')
        second_half_month = read_parquet(",".join(temp_date_range)).select(*raw_columns).cache()
        df_1 = first_half_month_df.union(second_half_month)

    ### 4. only parquet ###
    else:  # month_indicator >= '2019-08-31'
        df_1 = read_parquet(test_data).cache()

    df_1.createOrReplaceTempView("daily_data")

    register_daily_view("store.app-est-load.v3", "daily_unified_est")
    register_daily_view("store.download-attribution.v3", "daily_unified_attr_est")
    register_daily_view("store.download-attribution-load.v3", "daily_pre_load_data")

    sql_text = """
    
    WITH filter_top_N_raw_data AS(
    SELECT
     distinct
      id,
      Sum(est) AS est,
      store_id,
      platform_id,
      feed,
      vertical,
      platform
    FROM
      (
        SELECT
          DISTINCT d1.id,
          d1.est,
          d1.store_id,
          d1.date,
          d1.feed,
          d1.vertical,
          d1.platform_id,
          d1.platform
        FROM
          daily_data AS d1
          JOIN daily_data AS d2 
          ON d1.id = d2.id
          AND d1.store_id = d2.store_id
          AND d1.feed = d2.feed
          AND d1.vertical = d2.vertical
          AND d1.platform_id = d2.platform_id
        WHERE (d1.rank <= 4000 and d2.rank<=4000 and d1.store_id == 0 and d1.platform = 'ios' ) 
            OR (d1.rank <= 1000 and d2.rank<=1000 and  d1.store_id != 0 and d1.platform = 'ios' )
            OR  (d1.rank <= 4000 and d2.rank<=4000 and d1.store_id == 1000 and d1.platform = 'android' ) 
            OR (d1.rank <= 1000 and d2.rank<=1000 and  d1.store_id != 1000 and d1.platform = 'android' )
      ) AS t
    WHERE
      feed IN (
        0,
        1,
        2,
        101,
        100,
        102
      )
    GROUP BY
      id,
      store_id,
      platform_id,
      vertical,
      feed,
      platform);
      
     WITH replace_metric AS (
     SELECT * ,
         case 
        when feed='0' and platform='ios' then 'free_app_download'
        when feed='1' and platform='ios' then 'paid_app_download'
        when feed='2' and platform='ios' then 'revenue' 
        when feed='101' and platform='ios' then 'free_app_download' 
        when feed='100' and platform='ios' then 'paid_app_download' 
        when feed='102' and platform='ios' then 'revenue' 
        when feed='0' and platform='android' then 'free_app_download' 
        when feed='1' and platform='android' then 'paid_app_download' 
        when feed='2' and platform='android' then 'revenue' 
        end as metric from filter_top_N_raw_data);
      
      
         WITH replace_metric_device_code AS (
        SELECT * ,
         case 
        when feed='0' and platform='ios' then 'ios-phone'
        when feed='1' and platform='ios' then 'ios-phone'
        when feed='2' and platform='ios' then 'ios-phone' 
        when feed='101' and platform='ios' then 'ios-tablet' 
        when feed='100' and platform='ios' then 'ios-tablet' 
        when feed='102' and platform='ios' then 'ios-tablet' 
        when feed='0' and platform='android' then 'android-all' 
        when feed='1' and platform='android' then 'android-all' 
        when feed='2' and platform='android' then 'android-all' 
        end as device_code from replace_metric);


    WITH group_by_metric_1 AS (
        SELECT max(est) as est, id, metric,device_code, store_id, platform from replace_metric_device_code where store_id not in (3,4,5,6) and device_code in ('ios-phone' ,'ios-tablet' ) and feed not in (1000, 1001, 1002) group by id, store_id, metric,device_code, platform
        );
        
    WITH group_by_metric_2 AS (
        SELECT max(est) as est, id, metric,device_code, store_id, platform from replace_metric_device_code where store_id not in ( 1003, 1005, 1006,1007) and device_code='android-all' and feed not in (1000, 1001, 1002) group by id, store_id, metric,device_code, platform
    );
    
    WITH group_by_metric AS(
        SELECT * FROM group_by_metric_1
        UNION ALL
        SELECT * FROM group_by_metric_2
        );

      -- pivot metric column
    WITH pivot_metric_raw AS (

    SELECT 
        distinct id as app_id, store_id, platform, device_code, free_app_download,revenue, paid_app_download
    FROM
          group_by_metric
     PIVOT (
        max(est) 
    	FOR metric IN ('free_app_download','revenue', 'paid_app_download')
      )
    );
    
    
    -- union all platform with country_code mapping

    WITH country_code_mapping AS (
    select *, 'android' as market_code from android_country_mapping 
    UNION ALL select *, 'ios' market_code from ios_country_mapping
    UNION ALL select 143502, 'VE', 'VESA', 'ios'
    UNION ALL select 0, 'WW', 'worldwide', 'ios'
    UNION ALL select 36, 'CZ', 'CZ', 'android'
    UNION ALL select 5, 'ES', 'ES', 'android'

    );



    -- map raw with country_code

    WITH country_category_mapping_raw AS (
    select app_id, country_code, device_code, free_app_download, paid_app_download, revenue 
     from country_code_mapping 
     inner join 
         pivot_metric_raw 
     on 
         country_code_mapping.store_id=pivot_metric_raw.store_id 
     and 
         country_code_mapping.market_code=pivot_metric_raw.platform
    where country_name!='Global'
    );

    WITH download_unified_attr AS(
    SELECT app_id, coalesce(free_app_download, 0 ) as free_app_download, coalesce(paid_app_download, 0 ) as paid_app_download, coalesce(revenue, 0 ) as revenue, device_code, country_code from country_category_mapping_raw

    );
    
    
    WITH daily_unified_attr_est_join_est_table AS(
    SELECT a.app_id, a.free_app_download, a.paid_app_download, a.revenue, a.country_code, a.device_code, b.organic_download_share
    FROM download_unified_attr a
    JOIN daily_unified_attr_est b
    ON a.app_id=b.app_id
    AND a.country_code=b.country_code
    AND a.device_code=b.device_code
    
    );
    
    
    WITH caculate_data AS (
        SELECT CAST(ROUND (organic_download_share * (free_app_download+paid_app_download)) AS int) AS est_organic_download, 
        CAST(free_app_download+paid_app_download - ROUND(organic_download_share * (free_app_download+paid_app_download)) AS int) as est_paid_download,
        app_id, 
        free_app_download AS est_free_app_download, 
        paid_app_download AS est_paid_app_download, 
        revenue AS est_revenue, 
        device_code, 
        country_code
        FROM daily_unified_attr_est_join_est_table
    );


    WITH daily_pre_load_data_coalesce AS(
    SELECT app_id, coalesce(est_paid_download, 0 ) as est_paid_download, coalesce(est_organic_download, 0 ) as est_organic_download, coalesce(est_free_app_download, 0 ) as est_free_app_download, coalesce(est_paid_app_download, 0 ) as est_paid_app_download, coalesce(est_revenue, 0 ) as est_revenue, device_code, country_code from daily_pre_load_data

    );
    

      """

    def country_mapping_source(name, path):
        """Describe one gzip, tab-separated country-mapping CSV source for ``run``."""
        return {
            "data_encoding": "csv",
            "compression": "gzip",
            "name": name,
            "data_schema": [
                {"name": "store_id", "type": "int", "nullable": False},
                {"name": "country_code", "type": "string", "nullable": False},
                {"name": "country_name", "type": "string", "nullable": False},
            ],
            "csv_options": {
                'header': True,
                'sep': '\t',
                'quote': '',
                'encoding': 'utf-8',
                'escape': '',
            },
            "path": [path],
        }

    # store_unified , rank_unified
    ingest_msg = {
        "namespace": "aa.store.market-size.v1",
        "job_type": "routine",
        "options": {},
        "source": [
            country_mapping_source("ios_country_mapping", "s3://b2c-prod-dca-store-estimates/store_back/dimension/IOS_COUNTRY_MAPPING"),
            country_mapping_source("android_country_mapping", "s3://b2c-prod-dca-store-estimates/store_back/dimension/ANDROID_COUNTRY_MAPPING"),
        ],
    }

    run(spark, ingest_msg, sql_text)
    # eject_all_caches(spark)


# Columns compared between the recomputed estimates and the pre-load table.
compared_columns = "app_id, est_free_app_download, est_paid_app_download, est_organic_download, est_paid_download, est_revenue, device_code, country_code"
diff_template = "select {cols} from {left} except all select {cols} from {right} "

# Run the check day by day on the driver. The former
# sc.parallelize(map(...), 1) only wrapped the already-computed results in an
# RDD that was never used. The diff now runs inside the loop: the views are
# replaced on every call, so diffing once at the end only covered the last day.
for day in dates:
    test_monthkly_data(day)

    # spark.sql("select * from caculate_data").show()

    # Rows present on one side only, in both directions; two empty results mean a match.
    diff_df1 = spark.sql(diff_template.format(cols=compared_columns, left="caculate_data", right="daily_pre_load_data_coalesce"))
    diff_df2 = spark.sql(diff_template.format(cols=compared_columns, left="daily_pre_load_data_coalesce", right="caculate_data"))

    diff_df1.show()
    diff_df2.show()


In [0]:

import datetime
from pyspark.sql import types as T
from pyspark.sql import functions as F

from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.pipeline import type_
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from applications.auto_pipeline.transform import _view
from aadatapipelinecore.core.utils.spark import eject_all_caches
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import aaplproxy


# Half-open day range [start, end): `end` itself is not included.
start = "2019-07-14"
end = "2020-05-31"
# end = "2012-05-01"
real_date1 = datetime.date(*map(int, start.split('-')))
real_date2 = datetime.date(*map(int, end.split('-')))
date_range = real_date2 - real_date1
dates = []
for days in xrange(date_range.days):
    dates.append((real_date1 + datetime.timedelta(days=days)).isoformat())
# The list was built oldest-first; flip it so the newest date is processed first.
dates.reverse()




class DryRunSqlExecutor(SqlExecutor):
    """SqlExecutor whose `_verify_tasks` hook is replaced by a no-op."""

    def _verify_tasks(self):
        # Deliberately does nothing.
        return None


def run(spark, raw_data, sql_text, dry_run=True):
    """Parse `sql_text` and run the resulting tasks against the given sources.

    The "source" entry of `raw_data` is handed to `_view` (presumably to
    register each source as a view -- confirm against `_view`), the "options"
    entry is merged into the remaining keys to form the execution context, and
    the tasks produced by `SqlParser` are run as a TRANSFORM event.

    :param spark: active SparkSession.
    :param raw_data: dict with at least "namespace", "source" and "options".
        It is no longer modified, so the same message can be passed again.
    :param sql_text: SQL text handed to `_view` and `SqlParser`.
    :param dry_run: when True (default) use `DryRunSqlExecutor`, whose
        `_verify_tasks` is a no-op; otherwise use the plain `SqlExecutor`.
    :raises KeyError: if "namespace", "source" or "options" is missing.
    """
    # Shallow copy: the original popped keys straight out of the caller's dict,
    # which made a second call with the same message fail with KeyError.
    context = dict(raw_data)
    urn = Urn(namespace=context["namespace"])
    source_data_list = context.pop("source")
    context.update(context.pop("options"))
    _view(spark, sql_text, None, source_data_list)
    tasks = SqlParser(spark, sql_text, context).parse()
    sql_executor = DryRunSqlExecutor if dry_run else SqlExecutor
    sql_executor(urn, spark, tasks, type_.EventType.TRANSFORM, context).run()



# CSV schema
from pyspark.sql import types as T
from pyspark.sql import functions as F

# Column layout used when reading the tab-separated daily estimate files;
# every column is declared nullable.
csv_schema = (
    T.StructType()
    .add("store_id", T.IntegerType(), True)
    .add("date", T.DateType(), True)
    .add("platform_id", T.IntegerType(), True)
    .add("vertical", T.IntegerType(), True)
    .add("feed", T.IntegerType(), True)
    .add("id", T.LongType(), True)
    .add("est", T.IntegerType(), True)
    .add("category_id", T.IntegerType(), True)
    .add("rank", T.IntegerType(), True)
)

def test_monthly_category_data(test_data):
    """Diff recomputed category-level estimates against the unified daily fact.

    Loads the raw store-estimate rows for ``test_data`` into the ``daily_data``
    view, loads the ``granularity='daily'`` slice of
    ``store.app-est-category-load.v3`` for the same date into the
    ``daily_unified_category`` view, runs the SQL pipeline below through
    ``run`` and prints both directions of an ``EXCEPT ALL`` between the two
    results (two empty tables mean the data sets agree).

    Depends on notebook globals: ``spark``, ``csv_schema``, ``run``,
    ``eject_all_caches`` and ``get_date_list`` (defined in an earlier cell).

    :param test_data: date to check, as a ``'YYYY-MM-DD'`` string.
    :return: None; results are only printed via ``DataFrame.show()``.
    """
    print(test_data)

    raw_columns = ['id', 'store_id', 'category_id', 'platform_id', 'vertical', 'rank', 'feed', 'est', 'date', 'platform']
    csv_base_path = "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/"
    parquet_base_path = "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/"

    def read_csv_days(day_list):
        # Reads the ios/sbe_est_app CSV files for the given days and tags every
        # row with platform='ios' (the CSV schema has no platform column).
        csv_glob = csv_base_path + ("{%s}/ios/sbe_est_app/*/" % ",".join(day_list))
        return (spark.read.option("basePath", csv_base_path)
                .schema(csv_schema)
                .csv(csv_glob, sep="\t")
                .withColumn("platform", F.lit("ios"))
                .select(*raw_columns)
                .cache())

    def read_parquet_days(day_list):
        # Reads the parquet partitions for the given days (all platforms).
        parquet_glob = parquet_base_path + ("date={%s}/platform=*/*/" % ",".join(day_list))
        return spark.read.option("basePath", parquet_base_path).parquet(parquet_glob)

    month_indicator = test_data
    # NOTE(review): branches 1 and 3 load a multi-day range although the result
    # is compared with a single daily partition; they look like leftovers from
    # the monthly variant of this check -- confirm before relying on them.

    ### 1. only csv, but date range is '2010-07-04' to '2010-07-31' ###
    if month_indicator == '2010-07-31':
        df_1 = read_csv_days(get_date_list('2010-07-04', '2010-07-31', freq='D'))

    ### 2. only csv
    elif '2010-08-01' < month_indicator < '2019-07-01':
        # Fix: the original joined test_data[1], i.e. the second character of
        # the date string, which produced a path like ".../DAY/{0}/...".
        df_1 = read_csv_days([test_data])

    ### 3. half is csv, half is parquet ###
    elif month_indicator == '2019-07-31':
        # First half of 2019-07
        first_half_month_df = read_csv_days(get_date_list('2019-07-01', '2019-07-14'))
        # Second half of 2019-07
        second_half_month = read_parquet_days(get_date_list('2019-07-15', '2019-07-31')).select(*raw_columns).cache()
        df_1 = first_half_month_df.union(second_half_month)

    ### 4. only parquet ###
    else:  # month_indicator >= '2019-08-31'
        df_1 = read_parquet_days([test_data]).cache()

    df_1.createOrReplaceTempView("daily_data")

    daily_category_unified = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-load.v3/fact/").where("granularity='daily' and date='{}'".format(test_data)).cache()
    daily_category_unified.createOrReplaceTempView("daily_unified_category")

    # /date=2020-03-28
    sql_text = """
     WITH filter_top_N_raw_data AS(
    SELECT
  *
FROM
  (
    SELECT
      id,
      Sum(est) AS est,
      category_id,
      store_id,
      platform_id,
      feed,
      vertical,
      platform
    FROM
      (
        SELECT
          DISTINCT d1.id,
          d1.est,
          d1.store_id,
          d1.date,
          d1.feed,
          d1.vertical,
          d1.platform_id,
          d1.platform,
          d2.category_id
        FROM
          daily_data AS d1
          JOIN daily_data AS d2 
          ON d1.id = d2.id
          AND d1.store_id = d2.store_id
          AND d1.feed = d2.feed
          AND d1.vertical = d2.vertical
          AND d1.platform_id = d2.platform_id
          AND d1.platform = d2.platform
        WHERE (d1.rank <= 4000 and d2.rank<=4000 and d1.store_id == 0 and d1.platform = 'ios' ) 
            OR (d1.rank <= 1000 and d2.rank<=1000 and  d1.store_id != 0 and d1.platform = 'ios' )
            OR  (d1.rank <= 4000 and d2.rank<=4000 and d1.store_id == 1000 and d1.platform = 'android' ) 
            OR (d1.rank <= 1000 and d2.rank<=1000 and  d1.store_id != 1000 and d1.platform = 'android' )

      ) AS t
    WHERE
      feed IN (
        0,
        1,
        2,
        101,
        100,
        102
      )
    GROUP BY
      id,
      store_id,
      category_id,
      platform_id,
      platform,
      vertical,
      feed
  ) 
     );

      
     WITH replace_metric AS (
     SELECT * ,
         case 
        when feed='0' and platform='ios' then 'free_app_download'
        when feed='1' and platform='ios' then 'paid_app_download'
        when feed='2' and platform='ios' then 'revenue' 
        when feed='101' and platform='ios' then 'free_app_download' 
        when feed='100' and platform='ios' then 'paid_app_download' 
        when feed='102' and platform='ios' then 'revenue' 
        when feed='0' and platform='android' then 'free_app_download' 
        when feed='1' and platform='android' then 'paid_app_download' 
        when feed='2' and platform='android' then 'revenue' 
        end as metric from filter_top_N_raw_data);
      
      
         WITH replace_metric_device_code AS (
        SELECT * ,
         case 
        when feed='0' and platform='ios' then 'ios-phone'
        when feed='1' and platform='ios' then 'ios-phone'
        when feed='2' and platform='ios' then 'ios-phone' 
        when feed='101' and platform='ios' then 'ios-tablet' 
        when feed='100' and platform='ios' then 'ios-tablet' 
        when feed='102' and platform='ios' then 'ios-tablet' 
        when feed='0' and platform='android' then 'android-all' 
        when feed='1' and platform='android' then 'android-all' 
        when feed='2' and platform='android' then 'android-all' 
        end as device_code from replace_metric);


    WITH group_by_metric_1 AS (
        SELECT max(est) as est, id, metric,device_code, store_id, platform, category_id from replace_metric_device_code where store_id not in (3,4,5,6) and device_code in ('ios-phone' ,'ios-tablet' ) and feed not in (1000, 1001, 1002) group by id, store_id, metric,device_code, platform, category_id
        );
        
    WITH group_by_metric_2 AS (
        SELECT max(est) as est, id, metric,device_code, store_id, platform, category_id from replace_metric_device_code where store_id not in ( 1003, 1005, 1006,1007) and device_code='android-all' and feed not in (1000, 1001, 1002) group by id, store_id, metric,device_code, platform, category_id
    );
    
    WITH group_by_metric AS(
        SELECT * FROM group_by_metric_1
        UNION ALL
        SELECT * FROM group_by_metric_2
        );

      -- pivot metric column
    WITH pivot_metric_raw AS (

    SELECT 
        distinct id as app_id, store_id, platform, device_code, free_app_download,revenue, paid_app_download, category_id as category_id_pivot
    FROM
          group_by_metric
     PIVOT (
        max(est) 
        FOR metric IN ('free_app_download','revenue', 'paid_app_download')
      )
    );
    
    -- map raw with category
    WITH category_mapping_raw AS (

    SELECT * from 
        ( select *, 'ios' as mapping_platform from category_mapping_deminsion_service where market_code='apple-store' 
    UNION ALL select *, 'android' as mapping_platform from category_mapping_deminsion_service where market_code='google-play'
     ) as mapping 
    FULL OUTER JOIN pivot_metric_raw 
    ON 
     mapping.legacy_category_id=pivot_metric_raw.category_id_pivot 
    AND 
     mapping.mapping_platform=pivot_metric_raw.platform
    );
    
    
    
    -- union all platform with country_code mapping

    WITH country_code_mapping AS (
    select *, 'android' as market_code from android_country_mapping 
    UNION ALL select *, 'ios' market_code from ios_country_mapping
    UNION ALL select 143502, 'VE', 'VESA', 'ios'
    UNION ALL select 0, 'WW', 'worldwide', 'ios'
    UNION ALL select 36, 'CZ', 'CZ', 'android'
    UNION ALL select 5, 'ES', 'ES', 'android'

    );





    WITH country_category_mapping_raw AS (
    select app_id, country_code, device_code, free_app_download, paid_app_download, revenue, category_id 
     from country_code_mapping 
     inner join 
         category_mapping_raw 
     on 
         country_code_mapping.store_id=category_mapping_raw.store_id 
     and 
         country_code_mapping.market_code=category_mapping_raw.platform
    where country_name!='Global'
    );




    
      """

    # store_unified , rank_unified
    namespace = "aa.store.market-size.v1"
    # Both country-mapping files are gzip-compressed, tab-separated CSV with a
    # header row and quoting/escaping disabled.
    ingest_msg = {
        "namespace": namespace,
        "job_type": "routine",
        "options": {},
        "source": [
            # {
            #     "data_encoding": "parquet",
            #     "compression": "gzip",
            #     "name": "store_unified_weekly_data",
            #     "path": [
            #         "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v3/fact/granularity=weekly/date=2020-03-28/"],
            #     # "path": est_list,

            # },
            {
                "data_encoding": "csv",
                "compression": "gzip",
                "name": "ios_country_mapping",
                "data_schema": [
                    {"name": "store_id", "type": "int", "nullable": False},
                    {"name": "country_code", "type": "string", "nullable": False},
                    {"name": "country_name", "type": "string", "nullable": False}
                ],
                "csv_options": {
                    'header': True,
                    'sep': '\t',
                    'quote': '',
                    'encoding': 'utf-8',
                    'escape': ''
                },
                "path": ["s3://b2c-prod-dca-store-estimates/store_back/dimension/IOS_COUNTRY_MAPPING"],
            },
            {
                "data_encoding": "csv",
                "compression": "gzip",
                "name": "android_country_mapping",
                "data_schema": [
                    {"name": "store_id", "type": "int", "nullable": False},
                    {"name": "country_code", "type": "string", "nullable": False},
                    {"name": "country_name", "type": "string", "nullable": False}
                ],
                "csv_options": {
                    'header': True,
                    'sep': '\t',
                    'quote': '',
                    'encoding': 'utf-8',
                    'escape': ''
                },
                "path": ["s3://b2c-prod-dca-store-estimates/store_back/dimension/ANDROID_COUNTRY_MAPPING"],
            },
            {
                "data_encoding": "parquet",
                "compression": "gzip",
                "name": "category_mapping_deminsion_service",
                "path": ["s3://b2c-prod-data-pipeline-qa/aa.store/store_cateogry_mapping"],
            }
        ]
    }

    run(spark, ingest_msg, sql_text)

    # Rows only in the recomputed result, then rows only in the unified table.
    spark.sql("select * from country_category_mapping_raw except all select app_id, country_code, device_code, est_free_app_download as free_app_download, est_paid_app_download as paid_app_download, est_revenue as revenue, category_id from daily_unified_category ").show()
    spark.sql("select app_id, country_code, device_code, est_free_app_download as free_app_download, est_paid_app_download as paid_app_download, est_revenue as revenue, category_id from daily_unified_category  except all select * from country_category_mapping_raw").show()
    eject_all_caches(spark)

    

    
# Run the check for every date on the driver. The previous
# `sc.parallelize(map(...), 1)` only wrapped the (None) return values in an RDD
# that was never consumed, so a plain loop does the same work without it.
for test_date in dates:
    test_monthly_category_data(test_date)

In [0]:

import datetime
from pyspark.sql import types as T
from pyspark.sql import functions as F

from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.pipeline import type_
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from applications.auto_pipeline.transform import _view
from aadatapipelinecore.core.utils.spark import eject_all_caches
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import aaplproxy


# Half-open day range [start, end): `end` itself is not included.
start = "2020-05-02"
end = "2020-05-03"
# end = "2012-05-01"
real_date1 = datetime.date(*map(int, start.split('-')))
real_date2 = datetime.date(*map(int, end.split('-')))
date_range = real_date2 - real_date1
dates = []
for days in xrange(date_range.days):
    dates.append((real_date1 + datetime.timedelta(days=days)).isoformat())




class DryRunSqlExecutor(SqlExecutor):
    """SqlExecutor whose `_verify_tasks` hook is replaced by a no-op."""

    def _verify_tasks(self):
        # Deliberately does nothing.
        return None


def run(spark, raw_data, sql_text, dry_run=True):
    """Parse `sql_text` and run the resulting tasks against the given sources.

    The "source" entry of `raw_data` is handed to `_view` (presumably to
    register each source as a view -- confirm against `_view`), the "options"
    entry is merged into the remaining keys to form the execution context, and
    the tasks produced by `SqlParser` are run as a TRANSFORM event.

    :param spark: active SparkSession.
    :param raw_data: dict with at least "namespace", "source" and "options".
        It is no longer modified, so the same message can be passed again.
    :param sql_text: SQL text handed to `_view` and `SqlParser`.
    :param dry_run: when True (default) use `DryRunSqlExecutor`, whose
        `_verify_tasks` is a no-op; otherwise use the plain `SqlExecutor`.
    :raises KeyError: if "namespace", "source" or "options" is missing.
    """
    # Shallow copy: the original popped keys straight out of the caller's dict,
    # which made a second call with the same message fail with KeyError.
    context = dict(raw_data)
    urn = Urn(namespace=context["namespace"])
    source_data_list = context.pop("source")
    context.update(context.pop("options"))
    _view(spark, sql_text, None, source_data_list)
    tasks = SqlParser(spark, sql_text, context).parse()
    sql_executor = DryRunSqlExecutor if dry_run else SqlExecutor
    sql_executor(urn, spark, tasks, type_.EventType.TRANSFORM, context).run()



# CSV schema
from pyspark.sql import types as T
from pyspark.sql import functions as F

# Column layout used when reading the tab-separated daily estimate files;
# every column is declared nullable.
csv_schema = (
    T.StructType()
    .add("store_id", T.IntegerType(), True)
    .add("date", T.DateType(), True)
    .add("platform_id", T.IntegerType(), True)
    .add("vertical", T.IntegerType(), True)
    .add("feed", T.IntegerType(), True)
    .add("id", T.LongType(), True)
    .add("est", T.IntegerType(), True)
    .add("category_id", T.IntegerType(), True)
    .add("rank", T.IntegerType(), True)
)

def test_monthly_category_data(test_data):
    """Diff recomputed download-attribution category rows against the pre-loaded fact.

    Loads the raw store-estimate rows for ``test_data`` into the ``daily_data``
    view and the ``granularity='daily'`` slices of three unified delta tables
    into the views ``daily_unified_category``, ``daily_unified_attr_est`` and
    ``daily_pre_load_data``. The SQL pipeline below is then run through
    ``run`` and both directions of an ``EXCEPT ALL`` between ``caculate_data``
    and ``daily_pre_load_data_coalesce`` are printed (two empty tables mean the
    data sets agree).

    Depends on notebook globals: ``spark``, ``csv_schema``, ``run`` and
    ``get_date_list`` (defined in an earlier cell).

    :param test_data: date to check, as a ``'YYYY-MM-DD'`` string.
    :return: None; results are only printed via ``DataFrame.show()``.
    """
    # Fix: the original printed test_data[0] and test_data[1], i.e. the first
    # two characters of the date string.
    print(test_data)

    raw_columns = ['id', 'store_id', 'category_id', 'platform_id', 'vertical', 'rank', 'feed', 'est', 'date', 'platform']
    csv_base_path = "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/"
    parquet_base_path = "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/"
    unified_root = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/"
    daily_filter = "granularity='daily' and date='{}'".format(test_data)

    def read_csv_days(day_list):
        # Reads the ios/sbe_est_app CSV files for the given days and tags every
        # row with platform='ios' (the CSV schema has no platform column).
        csv_glob = csv_base_path + ("{%s}/ios/sbe_est_app/*/" % ",".join(day_list))
        return (spark.read.option("basePath", csv_base_path)
                .schema(csv_schema)
                .csv(csv_glob, sep="\t")
                .withColumn("platform", F.lit("ios"))
                .select(*raw_columns)
                .cache())

    def read_parquet_days(day_list):
        # Reads the parquet partitions for the given days (all platforms).
        parquet_glob = parquet_base_path + ("date={%s}/platform=*/*/" % ",".join(day_list))
        return spark.read.option("basePath", parquet_base_path).parquet(parquet_glob)

    def read_unified_daily(table):
        # Daily slice of one unified delta fact table for the date under test.
        return spark.read.format("delta").load(unified_root + table + "/fact/").where(daily_filter).cache()

    month_indicator = test_data
    # NOTE(review): branches 1 and 3 load a multi-day range although the result
    # is compared with a single daily partition; they look like leftovers from
    # the monthly variant of this check -- confirm before relying on them.

    ### 1. only csv, but date range is '2010-07-04' to '2010-07-31' ###
    if month_indicator == '2010-07-31':
        df_1 = read_csv_days(get_date_list('2010-07-04', '2010-07-31', freq='D'))

    ### 2. only csv
    elif '2010-08-01' < month_indicator < '2019-07-01':
        # Fix: the original joined test_data[1], i.e. the second character of
        # the date string, which produced a path like ".../DAY/{0}/...".
        df_1 = read_csv_days([test_data])

    ### 3. half is csv, half is parquet ###
    elif month_indicator == '2019-07-31':
        # First half of 2019-07
        first_half_month_df = read_csv_days(get_date_list('2019-07-01', '2019-07-14'))
        # Second half of 2019-07
        second_half_month = read_parquet_days(get_date_list('2019-07-15', '2019-07-31')).select(*raw_columns).cache()
        df_1 = first_half_month_df.union(second_half_month)

    ### 4. only parquet ###
    else:  # month_indicator >= '2019-08-31'
        df_1 = read_parquet_days([test_data]).cache()

    df_1.createOrReplaceTempView("daily_data")

    daily_category_unified = read_unified_daily("store.app-est-category-load.v3")
    daily_category_unified.createOrReplaceTempView("daily_unified_category")

    daily_download_attr = read_unified_daily("store.download-attribution.v3")
    daily_download_attr.createOrReplaceTempView("daily_unified_attr_est")

    # Separate name: the original reused `daily_download_attr` for this table.
    daily_pre_load = read_unified_daily("store.download-attribution-category-load.v3")
    daily_pre_load.createOrReplaceTempView("daily_pre_load_data")

    # /date=2020-03-28
    sql_text = """
     WITH filter_top_N_raw_data AS(
    SELECT
  *
FROM
  (
    SELECT
      id,
      Sum(est) AS est,
      category_id,
      store_id,
      platform_id,
      feed,
      vertical,
      platform
    FROM
      (
        SELECT
          DISTINCT d1.id,
          d1.est,
          d1.store_id,
          d1.date,
          d1.feed,
          d1.vertical,
          d1.platform_id,
          d1.platform,
          d2.category_id
        FROM
          daily_data AS d1
          JOIN daily_data AS d2 
          ON d1.id = d2.id
          AND d1.store_id = d2.store_id
          AND d1.feed = d2.feed
          AND d1.vertical = d2.vertical
          AND d1.platform_id = d2.platform_id
          AND d1.platform = d2.platform
        WHERE (d1.rank <= 4000 and d2.rank<=4000 and d1.store_id == 0 and d1.platform = 'ios' ) 
            OR (d1.rank <= 1000 and d2.rank<=1000 and  d1.store_id != 0 and d1.platform = 'ios' )
            OR  (d1.rank <= 4000 and d2.rank<=4000 and d1.store_id == 1000 and d1.platform = 'android' ) 
            OR (d1.rank <= 1000 and d2.rank<=1000 and  d1.store_id != 1000 and d1.platform = 'android' )

      ) AS t
    WHERE
      feed IN (
        0,
        1,
        2,
        101,
        100,
        102
      )
    GROUP BY
      id,
      store_id,
      category_id,
      platform_id,
      platform,
      vertical,
      feed
  ) 
     );

      
     WITH replace_metric AS (
     SELECT * ,
         case 
        when feed='0' and platform='ios' then 'free_app_download'
        when feed='1' and platform='ios' then 'paid_app_download'
        when feed='2' and platform='ios' then 'revenue' 
        when feed='101' and platform='ios' then 'free_app_download' 
        when feed='100' and platform='ios' then 'paid_app_download' 
        when feed='102' and platform='ios' then 'revenue' 
        when feed='0' and platform='android' then 'free_app_download' 
        when feed='1' and platform='android' then 'paid_app_download' 
        when feed='2' and platform='android' then 'revenue' 
        end as metric from filter_top_N_raw_data);
      
      
         WITH replace_metric_device_code AS (
        SELECT * ,
         case 
        when feed='0' and platform='ios' then 'ios-phone'
        when feed='1' and platform='ios' then 'ios-phone'
        when feed='2' and platform='ios' then 'ios-phone' 
        when feed='101' and platform='ios' then 'ios-tablet' 
        when feed='100' and platform='ios' then 'ios-tablet' 
        when feed='102' and platform='ios' then 'ios-tablet' 
        when feed='0' and platform='android' then 'android-all' 
        when feed='1' and platform='android' then 'android-all' 
        when feed='2' and platform='android' then 'android-all' 
        end as device_code from replace_metric);


    WITH group_by_metric_1 AS (
        SELECT max(est) as est, id, metric,device_code, store_id, platform, category_id from replace_metric_device_code where store_id not in (3,4,5,6) and device_code in ('ios-phone' ,'ios-tablet' ) and feed not in (1000, 1001, 1002) group by id, store_id, metric,device_code, platform, category_id
        );
        
    WITH group_by_metric_2 AS (
        SELECT max(est) as est, id, metric,device_code, store_id, platform, category_id from replace_metric_device_code where store_id not in ( 1003, 1005, 1006,1007) and device_code='android-all' and feed not in (1000, 1001, 1002) group by id, store_id, metric,device_code, platform, category_id
    );
    
    WITH group_by_metric AS(
        SELECT * FROM group_by_metric_1
        UNION ALL
        SELECT * FROM group_by_metric_2
        );

      -- pivot metric column
    WITH pivot_metric_raw AS (

    SELECT 
        distinct id as app_id, store_id, platform, device_code, free_app_download,revenue, paid_app_download, category_id as category_id_pivot
    FROM
          group_by_metric
     PIVOT (
        max(est) 
        FOR metric IN ('free_app_download','revenue', 'paid_app_download')
      )
    );
    
    -- map raw with category
    WITH category_mapping_raw AS (

    SELECT * from 
        ( select *, 'ios' as mapping_platform from category_mapping_deminsion_service where market_code='apple-store' 
    UNION ALL select *, 'android' as mapping_platform from category_mapping_deminsion_service where market_code='google-play'
     ) as mapping 
    FULL OUTER JOIN pivot_metric_raw 
    ON 
     mapping.legacy_category_id=pivot_metric_raw.category_id_pivot 
    AND 
     mapping.mapping_platform=pivot_metric_raw.platform
    );
    
    
    
    -- union all platform with country_code mapping

    WITH country_code_mapping AS (
    select *, 'android' as market_code from android_country_mapping 
    UNION ALL select *, 'ios' market_code from ios_country_mapping
    UNION ALL select 143502, 'VE', 'VESA', 'ios'
    UNION ALL select 0, 'WW', 'worldwide', 'ios'
    UNION ALL select 36, 'CZ', 'CZ', 'android'
    UNION ALL select 5, 'ES', 'ES', 'android'

    );





    WITH country_category_mapping_raw AS (
    select app_id, country_code, device_code, free_app_download, paid_app_download, revenue, category_id 
     from country_code_mapping 
     inner join 
         category_mapping_raw 
     on 
         country_code_mapping.store_id=category_mapping_raw.store_id 
     and 
         country_code_mapping.market_code=category_mapping_raw.platform
    where country_name!='Global'
    );





    WITH download_unified_attr AS(
    SELECT app_id, category_id, coalesce(free_app_download, 0 ) as free_app_download, coalesce(paid_app_download, 0 ) as paid_app_download, coalesce(revenue, 0 ) as revenue, device_code, country_code from country_category_mapping_raw

    );
    
    
    WITH daily_unified_attr_est_join_est_table AS(
    SELECT a.app_id, a.free_app_download, a.paid_app_download, a.revenue, a.country_code, a.device_code, a.category_id, b.organic_download_share
    FROM download_unified_attr a
    JOIN daily_unified_attr_est b
    ON a.app_id=b.app_id
    AND a.country_code=b.country_code
    AND a.device_code=b.device_code
    
    );
    
    
    WITH caculate_data AS (
        SELECT CAST(ROUND (organic_download_share * (free_app_download+paid_app_download)) AS int) AS est_organic_download, 
        CAST(free_app_download+paid_app_download - ROUND(organic_download_share * (free_app_download+paid_app_download)) AS int) as est_paid_download,
        app_id, 
        category_id,
        free_app_download AS est_free_app_download, 
        paid_app_download AS est_paid_app_download, 
        revenue AS est_revenue, 
        device_code, 
        country_code
        FROM daily_unified_attr_est_join_est_table
    );


    WITH daily_pre_load_data_coalesce AS(
    SELECT app_id, category_id, coalesce(est_paid_download, 0 ) as est_paid_download, coalesce(est_organic_download, 0 ) as est_organic_download, coalesce(est_free_app_download, 0 ) as est_free_app_download, coalesce(est_paid_app_download, 0 ) as est_paid_app_download, coalesce(est_revenue, 0 ) as est_revenue, device_code, country_code from daily_pre_load_data

    );


    
      """

    # store_unified , rank_unified
    namespace = "aa.store.market-size.v1"
    # Both country-mapping files are gzip-compressed, tab-separated CSV with a
    # header row and quoting/escaping disabled.
    ingest_msg = {
        "namespace": namespace,
        "job_type": "routine",
        "options": {},
        "source": [
            # {
            #     "data_encoding": "parquet",
            #     "compression": "gzip",
            #     "name": "store_unified_weekly_data",
            #     "path": [
            #         "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v3/fact/granularity=weekly/date=2020-03-28/"],
            #     # "path": est_list,

            # },
            {
                "data_encoding": "csv",
                "compression": "gzip",
                "name": "ios_country_mapping",
                "data_schema": [
                    {"name": "store_id", "type": "int", "nullable": False},
                    {"name": "country_code", "type": "string", "nullable": False},
                    {"name": "country_name", "type": "string", "nullable": False}
                ],
                "csv_options": {
                    'header': True,
                    'sep': '\t',
                    'quote': '',
                    'encoding': 'utf-8',
                    'escape': ''
                },
                "path": ["s3://b2c-prod-dca-store-estimates/store_back/dimension/IOS_COUNTRY_MAPPING"],
            },
            {
                "data_encoding": "csv",
                "compression": "gzip",
                "name": "android_country_mapping",
                "data_schema": [
                    {"name": "store_id", "type": "int", "nullable": False},
                    {"name": "country_code", "type": "string", "nullable": False},
                    {"name": "country_name", "type": "string", "nullable": False}
                ],
                "csv_options": {
                    'header': True,
                    'sep': '\t',
                    'quote': '',
                    'encoding': 'utf-8',
                    'escape': ''
                },
                "path": ["s3://b2c-prod-dca-store-estimates/store_back/dimension/ANDROID_COUNTRY_MAPPING"],
            },
            {
                "data_encoding": "parquet",
                "compression": "gzip",
                "name": "category_mapping_deminsion_service",
                "path": ["s3://b2c-prod-data-pipeline-qa/aa.store/store_cateogry_mapping"],
            }
        ]
    }

    run(spark, ingest_msg, sql_text)

    # Rows only in the recomputed result, then rows only in the pre-loaded table.
    diff_df1 = spark.sql("select app_id, category_id, est_free_app_download, est_paid_app_download, est_organic_download, est_paid_download, est_revenue, device_code, country_code from caculate_data except all select app_id, category_id, est_free_app_download, est_paid_app_download, est_organic_download, est_paid_download, est_revenue, device_code, country_code from daily_pre_load_data_coalesce ")
    diff_df2 = spark.sql("select app_id, category_id, est_free_app_download, est_paid_app_download, est_organic_download, est_paid_download, est_revenue, device_code, country_code from daily_pre_load_data_coalesce except all select app_id, category_id, est_free_app_download, est_paid_app_download, est_organic_download, est_paid_download, est_revenue, device_code, country_code from caculate_data ")

    diff_df1.show()
    diff_df2.show()

    

    
# Run the check for every date on the driver. The previous
# `sc.parallelize(map(...), 1)` only wrapped the (None) return values in an RDD
# that was never consumed, so a plain loop does the same work without it.
for test_date in dates:
    test_monthly_category_data(test_date)

In [0]:

# Spot check of a single app in the US within the coalesced pre-load view.
# NOTE(review): presumably relies on the view left behind by the previous
# cell's run(...) call -- confirm before running this cell on its own.
spot_check_sql = "select * from daily_pre_load_data_coalesce where country_code='US' and app_id=341456761"
spark.sql(spot_check_sql).show()

In [0]:

import datetime
from pyspark.sql import types as T
from pyspark.sql import functions as F

from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.pipeline import type_
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from applications.auto_pipeline.transform import _view
from aadatapipelinecore.core.utils.spark import eject_all_caches
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import aaplproxy


# Build one (saturday, [day strings]) entry per week over [start, end).
start = "2014-12-28"
end = "2017-01-01"
# end = "2012-05-01"
real_date1 = datetime.date(*map(int, start.split('-')))
real_date2 = datetime.date(*map(int, end.split('-')))
date_range = real_date2 - real_date1
dates = []
sar_list = []
for days in xrange(date_range.days):
    dates.append(real_date1 + datetime.timedelta(days=days))
    if dates[-1].weekday() != 5:
        continue
    # Saturday reached: emit the buffered days newest-first, then reset the
    # buffer so the next week starts empty.
    temp = list(map(str, reversed(dates)))
    del dates[:]
    sar_list.append((real_date1 + datetime.timedelta(days=days), temp))

# test_path=list()


        

# Show the Saturday that closes the first collected week.
print(sar_list[0][0])



class DryRunSqlExecutor(SqlExecutor):
    """SqlExecutor whose `_verify_tasks` hook is replaced by a no-op."""

    def _verify_tasks(self):
        # Deliberately does nothing.
        return None


def run(spark, raw_data, sql_text, dry_run=True):
    """Register the message's sources as views and execute the parsed SQL tasks.

    Args:
        spark: the active SparkSession.
        raw_data: dict carrying "namespace", "source" and "options" keys. It is
            no longer modified by this call.
        sql_text: SQL script handed to `_view` and `SqlParser`.
        dry_run: when True (default) the tasks run through DryRunSqlExecutor,
            otherwise through SqlExecutor.

    Raises:
        KeyError: if `raw_data` lacks "namespace", "source" or "options".
    """
    # Work on a shallow copy: popping "source"/"options" straight off the
    # caller's dict meant the same message could not be passed to run() twice.
    context = dict(raw_data)
    urn = Urn(namespace=context["namespace"])
    source_data_list = context.pop("source")
    # The entries of "options" are merged into the top level of the context.
    context.update(context.pop("options"))
    _view(spark, sql_text, None, source_data_list)
    tasks = SqlParser(spark, sql_text, context).parse()
    executor_cls = DryRunSqlExecutor if dry_run else SqlExecutor
    executor_cls(urn, spark, tasks, type_.EventType.TRANSFORM, context).run()



# Explicit schema applied to the tab-separated daily estimate files; the
# columns are listed in file order and every one of them is nullable.
csv_schema = T.StructType(list(
    T.StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("store_id", T.IntegerType()),
        ("date", T.DateType()),
        ("platform_id", T.IntegerType()),
        ("vertical", T.IntegerType()),
        ("feed", T.IntegerType()),
        ("id", T.LongType()),
        ("est", T.IntegerType()),
        ("category_id", T.IntegerType()),
        ("rank", T.IntegerType()),
    )
))




def test_weekly_data(test_data):
    """Rebuild one week of app estimates from daily raw data and diff it against the unified weekly table.

    Steps:
      1. Load the daily rows for the week (CSV or parquet, chosen by the week
         label) and register them as the `daily_data` view.
      2. Load the weekly rows for the same date from the unified Delta table
         and register them as `unified_weekly`.
      3. Run the SQL script through run(); the script ends with
         `country_category_mapping_raw`, which is queried afterwards.
      4. Show `EXCEPT ALL` in both directions; two empty results mean both
         sides agree.

    Args:
        test_data: pair of (week label, list of 'YYYY-MM-DD' day strings), as
            built in `sar_list`.

    Cached DataFrames are released through eject_all_caches() even when a step
    raises.

    NOTE(review): the date thresholds below look like they were written for
    month-end labels (the local used to be called month_indicator). A Saturday
    label inside July 2019 falls through to the parquet-only branch, and the
    '2010-07-31' branch reads four weeks of days. Confirm before running over
    those periods.
    """
    print(test_data)

    csv_base = "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/"
    parquet_base = "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/"
    daily_columns = ['id', 'store_id', 'category_id', 'platform_id', 'vertical', 'rank', 'feed', 'est', 'date', 'platform']

    def read_daily_csv(days, platform):
        # One platform's tab-separated daily files for the given days, tagged
        # with a literal `platform` column and cached.
        path = csv_base + "{%s}/%s/sbe_est_app/*/" % (",".join(days), platform)
        frame = spark.read.option("basePath", csv_base).schema(csv_schema).csv(path, sep="\t")
        return frame.withColumn("platform", F.lit(platform)).select(*daily_columns).cache()

    def read_daily_parquet(days, select_columns=False):
        # Daily parquet partitions of all platforms for the given days, cached;
        # optionally narrowed to the CSV column order so it can be unioned.
        path = parquet_base + "date={%s}/platform=*/*/" % ",".join(days)
        frame = spark.read.option("basePath", parquet_base).parquet(path)
        if select_columns:
            frame = frame.select(*daily_columns)
        return frame.cache()

    def country_mapping_source(name, path):
        # Source description for one gzip, tab-separated country mapping file.
        return {
            "data_encoding": "csv",
            "compression": "gzip",
            "name": name,
            "data_schema": [
                {"name": "store_id", "type": "int", "nullable": False},
                {"name": "country_code", "type": "string", "nullable": False},
                {"name": "country_name", "type": "string", "nullable": False},
            ],
            "csv_options": {
                'header': True,
                'sep': '\t',
                'quote': '',
                'encoding': 'utf-8',
                'escape': '',
            },
            "path": [path],
        }

    sql_text = """
    
    WITH filter_top_N_raw_data AS(
    SELECT
     distinct
      id,
      Sum(est) AS est,
      store_id,
      platform_id,
      feed,
      vertical,
      platform
    FROM
      (
        SELECT
          DISTINCT d1.id,
          d1.est,
          d1.store_id,
          d1.date,
          d1.feed,
          d1.vertical,
          d1.platform_id,
          d1.platform
        FROM
          daily_data AS d1
          JOIN daily_data AS d2 
          ON d1.id = d2.id
          AND d1.store_id = d2.store_id
          AND d1.feed = d2.feed
          AND d1.vertical = d2.vertical
          AND d1.platform_id = d2.platform_id
        WHERE (d1.rank <= 4000 and d2.rank<=4000 and d1.store_id == 0 and d1.platform = 'ios' ) 
            OR (d1.rank <= 1000 and d2.rank<=1000 and  d1.store_id != 0 and d1.platform = 'ios' )
            OR  (d1.rank <= 4000 and d2.rank<=4000 and d1.store_id == 1000 and d1.platform = 'android' ) 
            OR (d1.rank <= 1000 and d2.rank<=1000 and  d1.store_id != 1000 and d1.platform = 'android' )
      ) AS t
    WHERE
      feed IN (
        0,
        1,
        2,
        101,
        100,
        102
      )
    GROUP BY
      id,
      store_id,
      platform_id,
      vertical,
      feed,
      platform);
      
     WITH replace_metric AS (
     SELECT * ,
         case 
        when feed='0' and platform='ios' then 'free_app_download'
        when feed='1' and platform='ios' then 'paid_app_download'
        when feed='2' and platform='ios' then 'revenue' 
        when feed='101' and platform='ios' then 'free_app_download' 
        when feed='100' and platform='ios' then 'paid_app_download' 
        when feed='102' and platform='ios' then 'revenue' 
        when feed='0' and platform='android' then 'free_app_download' 
        when feed='1' and platform='android' then 'paid_app_download' 
        when feed='2' and platform='android' then 'revenue' 
        end as metric from filter_top_N_raw_data);
      
      
         WITH replace_metric_device_code AS (
        SELECT * ,
         case 
        when feed='0' and platform='ios' then 'ios-phone'
        when feed='1' and platform='ios' then 'ios-phone'
        when feed='2' and platform='ios' then 'ios-phone' 
        when feed='101' and platform='ios' then 'ios-tablet' 
        when feed='100' and platform='ios' then 'ios-tablet' 
        when feed='102' and platform='ios' then 'ios-tablet' 
        when feed='0' and platform='android' then 'android-all' 
        when feed='1' and platform='android' then 'android-all' 
        when feed='2' and platform='android' then 'android-all' 
        end as device_code from replace_metric);


    WITH group_by_metric_1 AS (
        SELECT max(est) as est, id, metric,device_code, store_id, platform from replace_metric_device_code where store_id not in (3,4,5,6) and device_code in ('ios-phone' ,'ios-tablet' ) and feed not in (1000, 1001, 1002) group by id, store_id, metric,device_code, platform
        );
        
    WITH group_by_metric_2 AS (
        SELECT max(est) as est, id, metric,device_code, store_id, platform from replace_metric_device_code where store_id not in ( 1003, 1005, 1006,1007) and device_code='android-all' and feed not in (1000, 1001, 1002) group by id, store_id, metric,device_code, platform
    );
    
    WITH group_by_metric AS(
        SELECT * FROM group_by_metric_1
        UNION ALL
        SELECT * FROM group_by_metric_2
        );

      -- pivot metric column
    WITH pivot_metric_raw AS (

    SELECT 
        distinct id as app_id, store_id, platform, device_code, free_app_download,revenue, paid_app_download
    FROM
          group_by_metric
     PIVOT (
        max(est) 
    	FOR metric IN ('free_app_download','revenue', 'paid_app_download')
      )
    );
    
    
    -- union all platform with country_code mapping

    WITH country_code_mapping AS (
    select *, 'android' as market_code from android_country_mapping 
    UNION ALL select *, 'ios' market_code from ios_country_mapping
    UNION ALL select 143502, 'VE', 'VESA', 'ios'
    UNION ALL select 0, 'WW', 'worldwide', 'ios'
    UNION ALL select 36, 'CZ', 'CZ', 'android'
    UNION ALL select 5, 'ES', 'ES', 'android'

    );



    -- map raw with country_code

    WITH country_category_mapping_raw AS (
    select app_id, country_code, device_code, free_app_download, paid_app_download, revenue 
     from country_code_mapping 
     inner join 
         pivot_metric_raw 
     on 
         country_code_mapping.store_id=pivot_metric_raw.store_id 
     and 
         country_code_mapping.market_code=pivot_metric_raw.platform
    where country_name!='Global'
    );


      """

    # store_unified , rank_unified
    namespace = "aa.store.market-size.v1"
    ingest_msg = {
        "namespace": namespace,
        "job_type": "routine",
        "options": {},
        "source": [
            country_mapping_source(
                "ios_country_mapping",
                "s3://b2c-prod-dca-store-estimates/store_back/dimension/IOS_COUNTRY_MAPPING"),
            country_mapping_source(
                "android_country_mapping",
                "s3://b2c-prod-dca-store-estimates/store_back/dimension/ANDROID_COUNTRY_MAPPING"),
        ],
    }

    week_label = str(test_data[0])
    try:
        ### 1. only csv, but date range is '2010-07-04' to '2010-07-31' ###
        if week_label == '2010-07-31':
            df_1 = read_daily_csv(get_date_list('2010-07-04', '2010-07-31', freq='D'), "ios")

        ### 2. only csv, ios only ###
        elif '2010-08-01' < week_label < '2013-01-01':
            df_1 = read_daily_csv(test_data[1], "ios")

        ### 3. only csv, ios and android ###
        elif '2013-01-01' < week_label < '2019-07-01':
            df_ios = read_daily_csv(test_data[1], "ios")
            df_android = read_daily_csv(test_data[1], "android")
            df_1 = df_ios.union(df_android)

        ### 4. half is csv, half is parquet ###
        elif week_label == '2019-07-31':
            # First half of 2019-07 comes from CSV, second half from parquet.
            first_half_month_df = read_daily_csv(get_date_list('2019-07-01', '2019-07-14'), "ios")
            second_half_month = read_daily_parquet(get_date_list('2019-07-15', '2019-07-31'), select_columns=True)
            df_1 = first_half_month_df.union(second_half_month)

        ### 5. only parquet ###
        else:
            df_1 = read_daily_parquet(test_data[1])

        df_1.createOrReplaceTempView("daily_data")

        print(test_data[0])
        weekly_df_ho = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v3/fact/").where("granularity='weekly' and date='{}' and data_stage='final'".format(test_data[0])).cache()
        weekly_df_ho.createOrReplaceTempView("unified_weekly")

        run(spark, ingest_msg, sql_text)

        # Rows only in the recomputed data, then rows only in the unified table.
        spark.sql("select * from country_category_mapping_raw except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from unified_weekly ").show()
        spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from unified_weekly  except all select * from country_category_mapping_raw").show()
    finally:
        # Previously skipped whenever a read, the SQL run or a diff raised,
        # which left the week's DataFrames cached for the following weeks.
        eject_all_caches(spark)

    

    
# Run the weekly check for every (saturday, days) entry of sar_list, one after
# another on the driver. The former sc.parallelize(map(...), 1) did exactly this
# inside map() and then wrapped the resulting None values in an RDD that nothing
# consumed, which wrongly suggested the checks ran in parallel.
for weekly_case in sar_list:
    test_weekly_data(weekly_case)




In [0]:

# Show one app/country pair on both sides of the weekly comparison:
# first the recomputed rows, then the unified weekly rows.
for drill_down_sql in (
    "select * from country_category_mapping_raw where app_id=1079292817 and country_code='NG'  ",
    "select * from unified_weekly where app_id=1079292817 and country_code='NG'",
):
    spark.sql(drill_down_sql).show()

In [0]:


# spark.read.option("basePath","s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date=2018-05-{06,07,08,09,10,11,12}").where("app_id=1079292817 and country_code='NG'").show()

# Read one store's raw daily CSV (no schema, so columns are _c0.._cN) and show
# the rows whose sixth column equals 1079292817.
store_day_path = "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/2018-05-10/ios/sbe_est_app/143561/"
raw_store_day = spark.read.option("basePath", store_day_path).csv(store_day_path, sep="\t")
raw_store_day.where("_c5=1079292817").show()



In [0]:

# weekly_df_ha = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v3/fact/").where("granularity='weekly' and date='2020-03-28' and data_stage='final'").cache()
# weekly_df_ha.createOrReplaceTempView("unified_weekly")

# weekly_df_ho = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v3/fact/").where("granularity='weekly' and date='2020-03-28' and data_stage='final'").cache()
# weekly_df_ho.createOrReplaceTempView("unified_category_weekly")

# Two-way EXCEPT ALL between the recomputed view and the unified weekly view.
only_in_recomputed = spark.sql("select * from country_category_mapping_raw except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from unified_weekly ")
only_in_recomputed.show()
only_in_unified = spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from unified_weekly  except all select * from country_category_mapping_raw")
only_in_unified.show()

In [0]:

import datetime
from datetime import datetime as DT

from pyspark.sql import types as T
from pyspark.sql import functions as F

from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.pipeline import type_
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from applications.auto_pipeline.transform import _view
from aadatapipelinecore.core.utils.spark import eject_all_caches
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import aaplproxy


start = "2017-09-17"
end = "2019-07-01"
# end = "2012-05-01"
real_date1 = datetime.date(*map(int, start.split('-')))
real_date2 = datetime.date(*map(int, end.split('-')))
date_range = real_date2 - real_date1

# Walk the half-open range [start, end) one day at a time. Days pile up in
# `dates` until a Saturday (weekday() == 5) is reached; the pile is then
# emptied, newest day first, into one (saturday, [day strings]) entry.
dates = []
sar_list = []
for days in range(date_range.days):
    current_day = real_date1 + datetime.timedelta(days)
    dates.append(current_day)
    if current_day.weekday() != 5:
        continue
    temp = list(map(str, reversed(dates)))
    del dates[:]
    sar_list.append((current_day, temp))

# test_path=list()

# Explicit schema applied to the tab-separated daily estimate files; the
# columns are listed in file order and every one of them is nullable.
csv_schema = T.StructType(list(
    T.StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("store_id", T.IntegerType()),
        ("date", T.DateType()),
        ("platform_id", T.IntegerType()),
        ("vertical", T.IntegerType()),
        ("feed", T.IntegerType()),
        ("id", T.LongType()),
        ("est", T.IntegerType()),
        ("category_id", T.IntegerType()),
        ("rank", T.IntegerType()),
    )
))
        

"""
get date:  [ [month, [days]], [month, [days]], [month, [days]], ....... ]
"""
def get_date_list(start_date, end_date, freq="D"):
    """Return the dates between start_date and end_date as 'YYYY-MM-DD' strings.

    Both ends are passed straight to pandas.date_range, so they are inclusive
    whenever they fall on the chosen frequency.

    freq:   D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency
    """
    import pandas as pd

    timestamps = pd.date_range(start=start_date, end=end_date, freq=freq)
    return [stamp.strftime('%Y-%m-%d') for stamp in timestamps]

# start = "2010-07-31"  # prod
start = "2010-08-31"    # test
end = start

# Month-end dates between start and end (a single month in the test setup).
monthly = get_date_list(start, end, freq='M')
print(monthly)

# Pair each month-end with every day of that month:
# [ [month_end, [days]], [month_end, [days]], ... ]
date_list = []
for m in monthly:
    first_of_month = m[:8] + '01'
    d = get_date_list(first_of_month, m, freq='D')
    month_and_day = [m, d]
    date_list.append(month_and_day)

print(date_list)

# Day strings of the first collected week.
print(sar_list[0][1])



class DryRunSqlExecutor(SqlExecutor):
    """SqlExecutor subclass whose `_verify_tasks` hook is replaced by a no-op."""

    def _verify_tasks(self):
        """Do nothing; overrides the hook inherited from SqlExecutor."""
        return None


def run(spark, raw_data, sql_text, dry_run=True):
    """Register the message's sources as views and execute the parsed SQL tasks.

    Args:
        spark: the active SparkSession.
        raw_data: dict carrying "namespace", "source" and "options" keys. It is
            no longer modified by this call.
        sql_text: SQL script handed to `_view` and `SqlParser`.
        dry_run: when True (default) the tasks run through DryRunSqlExecutor,
            otherwise through SqlExecutor.

    Raises:
        KeyError: if `raw_data` lacks "namespace", "source" or "options".
    """
    # Work on a shallow copy: popping "source"/"options" straight off the
    # caller's dict meant the same message could not be passed to run() twice.
    context = dict(raw_data)
    urn = Urn(namespace=context["namespace"])
    source_data_list = context.pop("source")
    # The entries of "options" are merged into the top level of the context.
    context.update(context.pop("options"))
    _view(spark, sql_text, None, source_data_list)
    tasks = SqlParser(spark, sql_text, context).parse()
    executor_cls = DryRunSqlExecutor if dry_run else SqlExecutor
    executor_cls(urn, spark, tasks, type_.EventType.TRANSFORM, context).run()





def test_weekly_data(test_data):
    """Rebuild one week of per-category app estimates from daily raw data and diff it against the unified table.

    Steps:
      1. Load the daily rows for the week (CSV or parquet, chosen by the week
         label) and register them as the `daily_data` view.
      2. Load the weekly rows for the same date from the unified category
         Delta table and register them as `unified_category_weekly`.
      3. Run the SQL script through run(); the script ends with
         `country_category_mapping_raw`, which is queried afterwards.
      4. Show `EXCEPT ALL` in both directions; two empty results mean both
         sides agree.

    Args:
        test_data: pair of (week label, list of 'YYYY-MM-DD' day strings), as
            built in `sar_list`.

    Cached DataFrames are released through eject_all_caches() even when a step
    raises.

    NOTE(review): the date thresholds below look like they were written for
    month-end labels (the local used to be called month_indicator). A Saturday
    label inside July 2019 falls through to the parquet-only branch, and the
    '2010-07-31' branch reads four weeks of days. Confirm before running over
    those periods.
    """
    print(test_data[1])

    csv_base = "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/"
    parquet_base = "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/"
    daily_columns = ['id', 'store_id', 'category_id', 'platform_id', 'vertical', 'rank', 'feed', 'est', 'date', 'platform']

    def read_daily_csv(days, platform):
        # One platform's tab-separated daily files for the given days, tagged
        # with a literal `platform` column and cached.
        path = csv_base + "{%s}/%s/sbe_est_app/*/" % (",".join(days), platform)
        frame = spark.read.option("basePath", csv_base).schema(csv_schema).csv(path, sep="\t")
        return frame.withColumn("platform", F.lit(platform)).select(*daily_columns).cache()

    def read_daily_parquet(days, select_columns=False):
        # Daily parquet partitions of all platforms for the given days, cached;
        # optionally narrowed to the CSV column order so it can be unioned.
        path = parquet_base + "date={%s}/platform=*/*/" % ",".join(days)
        frame = spark.read.option("basePath", parquet_base).parquet(path)
        if select_columns:
            frame = frame.select(*daily_columns)
        return frame.cache()

    def country_mapping_source(name, path):
        # Source description for one gzip, tab-separated country mapping file.
        return {
            "data_encoding": "csv",
            "compression": "gzip",
            "name": name,
            "data_schema": [
                {"name": "store_id", "type": "int", "nullable": False},
                {"name": "country_code", "type": "string", "nullable": False},
                {"name": "country_name", "type": "string", "nullable": False},
            ],
            "csv_options": {
                'header': True,
                'sep': '\t',
                'quote': '',
                'encoding': 'utf-8',
                'escape': '',
            },
            "path": [path],
        }

    sql_text = """
     WITH filter_top_N_raw_data AS(
    SELECT
  *
FROM
  (
    SELECT
      id,
      Sum(est) AS est,
      category_id,
      store_id,
      platform_id,
      feed,
      vertical,
      platform
    FROM
      (
        SELECT
          DISTINCT d1.id,
          d1.est,
          d1.store_id,
          d1.date,
          d1.feed,
          d1.vertical,
          d1.platform_id,
          d1.platform,
          d2.category_id
        FROM
          daily_data AS d1
          JOIN daily_data AS d2 
          ON d1.id = d2.id
          AND d1.store_id = d2.store_id
          AND d1.feed = d2.feed
          AND d1.vertical = d2.vertical
          AND d1.platform_id = d2.platform_id
          AND d1.platform = d2.platform
        WHERE (d1.rank <= 4000 and d2.rank<=4000 and d1.store_id == 0 and d1.platform = 'ios' ) 
            OR (d1.rank <= 1000 and d2.rank<=1000 and  d1.store_id != 0 and d1.platform = 'ios' )
            OR  (d1.rank <= 4000 and d2.rank<=4000 and d1.store_id == 1000 and d1.platform = 'android' ) 
            OR (d1.rank <= 1000 and d2.rank<=1000 and  d1.store_id != 1000 and d1.platform = 'android' )

      ) AS t
    WHERE
      feed IN (
        0,
        1,
        2,
        101,
        100,
        102
      )
    GROUP BY
      id,
      store_id,
      category_id,
      platform_id,
      platform,
      vertical,
      feed
  ) 
     );

      
     WITH replace_metric AS (
     SELECT * ,
         case 
        when feed='0' and platform='ios' then 'free_app_download'
        when feed='1' and platform='ios' then 'paid_app_download'
        when feed='2' and platform='ios' then 'revenue' 
        when feed='101' and platform='ios' then 'free_app_download' 
        when feed='100' and platform='ios' then 'paid_app_download' 
        when feed='102' and platform='ios' then 'revenue' 
        when feed='0' and platform='android' then 'free_app_download' 
        when feed='1' and platform='android' then 'paid_app_download' 
        when feed='2' and platform='android' then 'revenue' 
        end as metric from filter_top_N_raw_data);
      
      
         WITH replace_metric_device_code AS (
        SELECT * ,
         case 
        when feed='0' and platform='ios' then 'ios-phone'
        when feed='1' and platform='ios' then 'ios-phone'
        when feed='2' and platform='ios' then 'ios-phone' 
        when feed='101' and platform='ios' then 'ios-tablet' 
        when feed='100' and platform='ios' then 'ios-tablet' 
        when feed='102' and platform='ios' then 'ios-tablet' 
        when feed='0' and platform='android' then 'android-all' 
        when feed='1' and platform='android' then 'android-all' 
        when feed='2' and platform='android' then 'android-all' 
        end as device_code from replace_metric);


    WITH group_by_metric_1 AS (
        SELECT max(est) as est, id, metric,device_code, store_id, platform, category_id from replace_metric_device_code where store_id not in (3,4,5,6) and device_code in ('ios-phone' ,'ios-tablet' ) and feed not in (1000, 1001, 1002) group by id, store_id, metric,device_code, platform, category_id
        );
        
    WITH group_by_metric_2 AS (
        SELECT max(est) as est, id, metric,device_code, store_id, platform, category_id from replace_metric_device_code where store_id not in ( 1003, 1005, 1006,1007) and device_code='android-all' and feed not in (1000, 1001, 1002) group by id, store_id, metric,device_code, platform, category_id
    );
    
    WITH group_by_metric AS(
        SELECT * FROM group_by_metric_1
        UNION ALL
        SELECT * FROM group_by_metric_2
        );

      -- pivot metric column
    WITH pivot_metric_raw AS (

    SELECT 
        distinct id as app_id, store_id, platform, device_code, free_app_download,revenue, paid_app_download, category_id as category_id_pivot
    FROM
          group_by_metric
     PIVOT (
        max(est) 
    	FOR metric IN ('free_app_download','revenue', 'paid_app_download')
      )
    );
    
    -- map raw with category
    WITH category_mapping_raw AS (

    SELECT * from 
        ( select *, 'ios' as mapping_platform from category_mapping_deminsion_service where market_code='apple-store' 
    UNION ALL select *, 'android' as mapping_platform from category_mapping_deminsion_service where market_code='google-play'
     ) as mapping 
    FULL OUTER JOIN pivot_metric_raw 
    ON 
     mapping.legacy_category_id=pivot_metric_raw.category_id_pivot 
    AND 
     mapping.mapping_platform=pivot_metric_raw.platform
    );
    
    
    
    -- union all platform with country_code mapping

    WITH country_code_mapping AS (
    select *, 'android' as market_code from android_country_mapping 
    UNION ALL select *, 'ios' market_code from ios_country_mapping
    UNION ALL select 143502, 'VE', 'VESA', 'ios'
    UNION ALL select 0, 'WW', 'worldwide', 'ios'
    UNION ALL select 36, 'CZ', 'CZ', 'android'
    UNION ALL select 5, 'ES', 'ES', 'android'

    );





    WITH country_category_mapping_raw AS (
    select app_id, country_code, device_code, free_app_download, paid_app_download, revenue, category_id 
     from country_code_mapping 
     inner join 
         category_mapping_raw 
     on 
         country_code_mapping.store_id=category_mapping_raw.store_id 
     and 
         country_code_mapping.market_code=category_mapping_raw.platform
    where country_name!='Global'
    );




    
      """

    # store_unified , rank_unified
    namespace = "aa.store.market-size.v1"
    ingest_msg = {
        "namespace": namespace,
        "job_type": "routine",
        "options": {},
        "source": [
            country_mapping_source(
                "ios_country_mapping",
                "s3://b2c-prod-dca-store-estimates/store_back/dimension/IOS_COUNTRY_MAPPING"),
            country_mapping_source(
                "android_country_mapping",
                "s3://b2c-prod-dca-store-estimates/store_back/dimension/ANDROID_COUNTRY_MAPPING"),
            {
                "data_encoding": "parquet",
                "compression": "gzip",
                "name": "category_mapping_deminsion_service",
                "path": ["s3://b2c-prod-data-pipeline-qa/aa.store/store_cateogry_mapping"],
            },
        ],
    }

    week_label = str(test_data[0])
    try:
        ### 1. only csv, but date range is '2010-07-04' to '2010-07-31' ###
        if week_label == '2010-07-31':
            df_1 = read_daily_csv(get_date_list('2010-07-04', '2010-07-31', freq='D'), "ios")

        ### 2. only csv, ios only ###
        elif '2010-08-01' < week_label < '2013-01-01':
            df_1 = read_daily_csv(test_data[1], "ios")

        ### 3. only csv, ios and android ###
        elif '2013-01-01' < week_label < '2019-07-01':
            df_ios = read_daily_csv(test_data[1], "ios")
            df_android = read_daily_csv(test_data[1], "android")
            df_1 = df_ios.union(df_android)

        ### 4. half is csv, half is parquet ###
        elif week_label == '2019-07-31':
            # First half of 2019-07 comes from CSV, second half from parquet.
            first_half_month_df = read_daily_csv(get_date_list('2019-07-01', '2019-07-14'), "ios")
            second_half_month = read_daily_parquet(get_date_list('2019-07-15', '2019-07-31'), select_columns=True)
            df_1 = first_half_month_df.union(second_half_month)

        ### 5. only parquet ###
        else:
            df_1 = read_daily_parquet(test_data[1])

        df_1.createOrReplaceTempView("daily_data")

        weekly_df_ho = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v3/fact/").where("granularity='weekly' and date='{}' and data_stage='final'".format(test_data[0])).cache()
        weekly_df_ho.createOrReplaceTempView("unified_category_weekly")

        run(spark, ingest_msg, sql_text)

        # Rows only in the recomputed data, then rows only in the unified table.
        spark.sql("select * from country_category_mapping_raw except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue, category_id from unified_category_weekly ").show()
        spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue, category_id from unified_category_weekly  except all select * from country_category_mapping_raw").show()
    finally:
        # Previously skipped whenever a read, the SQL run or a diff raised,
        # which left the week's DataFrames cached for the following weeks.
        eject_all_caches(spark)

    

    
# Run the weekly category check for every (saturday, days) entry of sar_list,
# one after another on the driver. The former sc.parallelize(map(...), 1) did
# exactly this inside map() and then wrapped the resulting None values in an
# RDD that nothing consumed, which wrongly suggested the checks ran in parallel.
for weekly_case in sar_list:
    test_weekly_data(weekly_case)

In [0]:

# Two-way EXCEPT ALL between the recomputed view and the unified category view.
only_in_recomputed = spark.sql("select * from country_category_mapping_raw except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue, category_id from unified_category_weekly ")
only_in_recomputed.show()
only_in_unified = spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue, category_id from unified_category_weekly  except all select * from country_category_mapping_raw")
only_in_unified.show()

In [0]:

# Same app / country / device on both sides of the comparison.
# First: the rows I computed from the raw daily data.
recomputed_rows = spark.sql("select * from country_category_mapping_raw where app_id = 20600001302870 and country_code='WW' and device_code='android-all' ")
recomputed_rows.show()
# Second: the rows you computed, i.e. the unified table
# ("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v3/fact/granularity=weekly/date=2020-03-28/").
unified_rows = spark.sql("select * from unified_category_weekly where app_id = 20600001302870 and country_code='WW'  and device_code='android-all'")
unified_rows.show()


In [0]:

# Raw daily rows for one app in store 1000, feed 2, limited to rank <= 4000
# (up to 200 rows shown).
raw_daily_rows = spark.sql("select * from daily_data where id=20600001302870 and store_id=1000 and feed = 2 and rank<=4000 ")
raw_daily_rows.show(200)

In [0]:

# Rows of the first pipeline stage for one app/store pair (up to 200 rows shown).
filtered_rows = spark.sql("select * from filter_top_N_raw_data where id=971304016 and store_id=143489")
filtered_rows.show(200)

In [0]:


import datetime
from pyspark.sql import types as T
from pyspark.sql import functions as F


# Date window for the test. The end date is exclusive: only the days in
# [start_week, end_week) are generated.
start_week = "2020-03-22"
end_week = "2020-03-29"

real_date1 = datetime.date(*[int(x) for x in start_week.split('-')])
real_date2 = datetime.date(*[int(x) for x in end_week.split('-')])
date_range = real_date2 - real_date1

# dates    : every day in the window as 'YYYY-MM-DD'
# sar_list : only the Saturdays (weekday() == 5) in the window, same format
# A single pass builds both lists; the former second loop only filled and
# drained a scratch list that was never read.
dates = list()
sar_list = list()
for day_offset in range(date_range.days):
    current_day = real_date1 + datetime.timedelta(day_offset)
    dates.append(str(current_day))
    if current_day.weekday() == 5:
        sar_list.append(str(current_day))

# Single-argument print(...) produces the same output on Python 2 and 3.
print('test date')
print(dates)
print(sar_list)
 


# Read the daily iOS parquet partitions for every date in `dates` (joined into
# a `{d1,d2,...}` glob), cache the frame and expose it as the `daily_data` view.
df_1 = spark.read.option("basePath","s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/").parquet("s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/date={%s}/platform=ios/*/" %  ",".join(dates)).cache()

# Disabled: monthly / weekly CSV loads.
# df_2 = spark.read.option("basePath", "s3://b2c-prod-dca-store-estimates/store_est/v_final/MONTH/").schema(monthly_csv_schema).csv("s3://b2c-prod-dca-store-estimates/store_est/v_final/MONTH/{%s}/ios/sbe_est_app/*/" % dates[-1], sep="\t").cache()
# df_3 = spark.read.option("basePath", "s3://b2c-prod-dca-store-estimates/store_est/v_final/WEEK/").schema(weekly_csv_schema).csv("s3://b2c-prod-dca-store-estimates/store_est/v_final/WEEK/{%s}/ios/sbe_est_app/*/" % ",".join(sar_list), sep="\t").cache()

df_1.createOrReplaceTempView("daily_data")
# df_2.createOrReplaceTempView("monthly_data")
# df_3.createOrReplaceTempView("weekly_data")



In [0]:

# Sum `est` per (id, store_id, platform_id, vertical, feed) over `daily_data`.
# The inner query self-joins daily_data on those keys, keeps DISTINCT d1 rows
# where both sides satisfy the rank cut-off (<= 4000 when store_id == 0,
# <= 1000 otherwise, platform 'ios' only), and the outer query restricts feed
# to 0, 1, 2, 100, 101, 102.
# NOTE(review): the statement ends in `.create`, which is not a DataFrame
# method or column and raises AttributeError when the cell runs. It looks like
# a truncated `.createOrReplaceTempView(...)`; the intended view name cannot
# be determined from this cell - confirm and complete.
spark.sql('''
    SELECT
      id,
      Sum(est) AS est,
      store_id,
      platform_id,
      feed,
      vertical
    FROM
      (
        SELECT
          DISTINCT d1.id,
          d1.est,
          d1.store_id,
          d1.date,
          d1.feed,
          d1.vertical,
          d1.platform_id
        FROM
          daily_data AS d1
          JOIN daily_data AS d2 
          ON d1.id = d2.id
          AND d1.store_id = d2.store_id
          AND d1.feed = d2.feed
          AND d1.vertical = d2.vertical
          AND d1.platform_id = d2.platform_id
        WHERE (d1.rank <= 4000 and d2.rank<=4000 and d1.store_id == 0 and d1.platform = 'ios' ) 
            OR (d1.rank <= 1000 and d2.rank<=1000 and  d1.store_id != 0 and d1.platform = 'ios' )
      ) AS t
    WHERE
      feed IN (
        0,
        1,
        2,
        101,
        100,
        102
      )
    GROUP BY
      id,
      store_id,
      platform_id,
      vertical,
      feed
  ''').create
  
  
# Add a `metric` column to `raw_map_data_df` by mapping each (feed, platform)
# pair to a metric name; pairs that are not listed yield NULL (no ELSE branch).
# NOTE(review): the resulting DataFrame is neither assigned, registered as a
# view, nor shown, so this statement has no visible effect. `raw_map_data_df`
# is not created in the visible cells - confirm where it comes from.
spark.sql('''select *, 
    case 
        when feed='0' and platform='ios' then 'free_app_download'
        when feed='1' and platform='ios' then 'paid_app_download'
        when feed='2' and platform='ios' then 'revenue' 
        when feed='3' and platform='ios' then 'revenue_iap' 
        when feed='4' and platform='ios' then 'revenue_non_iap' 
        when feed='101' and platform='ios' then 'free_app_download' 
        when feed='100' and platform='ios' then 'paid_app_download' 
        when feed='102' and platform='ios' then 'revenue' 
        when feed='103' and platform='ios' then 'revenue_iap' 
        when feed='104' and platform='ios' then 'revenue_non_iap' 
        when feed='1000' and platform='ios' then 'free_app_download' 
        when feed='1001' and platform='ios' then 'paid_app_download' 
        when feed='1002' and platform='ios' then 'revenue' 
        when feed='0' and platform='android' then 'free_app_download' 
        when feed='1' and platform='android' then 'paid_app_download' 
        when feed='2' and platform='android' then 'revenue' 
        when feed='3' and platform='android' then 'revenue_iap' 
        when feed='4' and platform='android' then 'revenue_non_iap' 
        end as metric from raw_map_data_df''')

In [0]:

# Build a wide per-app frame from `daily_data`:
#   1. the inner query sums `est` per (id, store_id, platform_id, vertical, feed)
#      using the same rank / platform filter as the un-pivoted query;
#   2. PIVOT turns the feed values 0, 1, 2, 101, 100, 102 into columns holding
#      max(est);
#   3. each pivot column is renamed with platform_feed_to_metric('ios', <feed>)
#      (helper defined outside this cell), NULLs are replaced by 0 and the
#      result is cached.
# NOTE(review): if platform_feed_to_metric returns the same name for two feeds
# (e.g. 0 and 101), the renamed frame will contain duplicate column names -
# verify against the helper.
new_transfor_df = spark.sql('''
SELECT
  *
FROM
  (
    SELECT
      id,
      Sum(est) AS est,
      store_id,
      platform_id,
      feed,
      vertical
    FROM
      (
        SELECT
          DISTINCT d1.id,
          d1.est,
          d1.store_id,
          d1.date,
          d1.feed,
          d1.vertical,
          d1.platform_id
        FROM
          daily_data AS d1
          JOIN daily_data AS d2 
          ON d1.id = d2.id
          AND d1.store_id = d2.store_id
          AND d1.feed = d2.feed
          AND d1.vertical = d2.vertical
          AND d1.platform_id = d2.platform_id
        WHERE (d1.rank <= 4000 and d2.rank<=4000 and d1.store_id == 0 and d1.platform = 'ios' ) 
            OR (d1.rank <= 1000 and d2.rank<=1000 and  d1.store_id != 0 and d1.platform = 'ios' )
      ) AS t
    WHERE
      feed IN (
        0,
        1,
        2,
        101,
        100,
        102
      )
    GROUP BY
      id,
      store_id,
      platform_id,
      vertical,
      feed
  ) PIVOT (
    max(est) FOR feed IN (
      0,
      1,
      2,
      101,
      100,
      102
    )
  )
''').withColumnRenamed("0", platform_feed_to_metric('ios',0)).withColumnRenamed("1", platform_feed_to_metric('ios',1)).withColumnRenamed("2", platform_feed_to_metric('ios',2)).withColumnRenamed("101", platform_feed_to_metric('ios',101)).withColumnRenamed("100", platform_feed_to_metric('ios',100)).withColumnRenamed("102", platform_feed_to_metric('ios',102)).na.fill(0).cache()
                                                        
# Expose the pivoted frame as the `transfer_category_est` view.
new_transfor_df.createOrReplaceTempView("transfer_category_est")

# NOTE(review): this queries `transfer_new`, but the view registered just above
# is `transfer_category_est`. Unless `transfer_new` is created elsewhere in the
# session this fails with "Table or view not found" - confirm the intended name.
spark.sql("select * from transfer_new").show()


In [0]:

import datetime
from pyspark.sql import types as T
from pyspark.sql import functions as F

from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.pipeline import type_
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from applications.auto_pipeline.transform import _view
from aadatapipelinecore.core.utils.spark import eject_all_caches

spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import aaplproxy


# Date window for the test. The end date is exclusive: only the days in
# [start, end) are visited.
start = "2020-03-08"
end = "2020-03-15"
# end = "2012-05-01"
real_date1 = datetime.date(*[int(x) for x in start.split('-')])
real_date2 = datetime.date(*[int(x) for x in end.split('-')])
date_range = real_date2 - real_date1

# Bucket the days into weeks that close on Saturday (weekday() == 5).
# `dates` collects the days seen since the previous Saturday; on each Saturday
# the bucket is emitted newest-first as {saturday: [days...]} and emptied.
# Days after the last Saturday stay in `dates` and produce no entry.
dates = list()
sar_list = list()
for day_offset in range(date_range.days):
    current_day = real_date1 + datetime.timedelta(day_offset)
    dates.append(current_day)
    if current_day.weekday() == 5:
        week_days = dates[::-1]
        del dates[:]
        sar_list.append({current_day: week_days})

# One tuple per week:
#   ([weekly download-attribution glob], [day strings], [daily app-est paths])
test_path = list()
for week in sar_list:
    for week_end, week_days in week.items():
        test_path.append(
            (
                ["s3://b2c-mktint-prod-dca-kpi/download_attribution/week_and_month_routine/v1.0.0/WEEK/{}/*/".format(week_end)],
                [day.strftime("%Y-%m-%d") for day in week_days],
                ["s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={}".format(day) for day in week_days],
            )
        )
# print test_path



class DryRunSqlExecutor(SqlExecutor):
    """SqlExecutor subclass whose `_verify_tasks` hook does nothing."""

    def _verify_tasks(self):
        """Deliberate no-op override.

        Presumably skips the task verification the base class would perform,
        so the SQL can be run ad hoc - confirm against SqlExecutor.
        """
        return None


def run(spark, raw_data, sql_text, dry_run=True):
    """Parse `sql_text` into tasks and execute them as a TRANSFORM event.

    :param spark: active SparkSession handed to the view helper, parser and executor.
    :param raw_data: message dict; must contain "namespace", "source" and
        "options". It is no longer modified, so the same message can be
        passed to several calls.
    :param sql_text: SQL script handed to `_view` and `SqlParser`.
    :param dry_run: when true (default) use DryRunSqlExecutor, otherwise SqlExecutor.
    :raises KeyError: if "namespace", "source" or "options" is missing.
    """
    # Shallow copy: the pops below used to strip "source"/"options" from the
    # caller's dict, so a second run() with the same message raised KeyError.
    context = dict(raw_data)
    urn = Urn(namespace=context["namespace"])
    source_data_list = context.pop("source")
    # Flatten the options into the top level of the context.
    context.update(context.pop("options"))
    # presumably registers each source as a temp view for the SQL - confirm in _view
    _view(spark, sql_text, None, source_data_list)
    tasks = SqlParser(spark, sql_text, context).parse()
    executor_cls = DryRunSqlExecutor if dry_run else SqlExecutor
    executor_cls(urn, spark, tasks, type_.EventType.TRANSFORM, context).run()





def test_weekly_data(test_data):
    """Load daily/weekly unified data for one week and run the comparison SQL.

    :param test_data: indexable; test_data[1] is joined with "," into a date
        glob and test_data[0] is formatted into the weekly path - presumably a
        (week_end, [day strings]) pair, verify against the caller.

    NOTE(review): as written the function cannot complete - see the inline
    notes on `test_date`, the `df_3` base path and the `compare_data_*` views.
    """
    print 'test_date[0]', test_data[0]
    print 'test_data[1]', test_data[1]
    # Daily category ranks and daily estimates for every day of the week.
    df_1 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=daily/date={%s}/" % ",".join(test_data[1]))
    df_2 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={%s}/" % ",".join(test_data[1]))
    # NOTE(review): basePath points at ...category-rank.v1/ while the data path
    # is under ...category-rank.v3/ - the two disagree; confirm which version
    # is intended.
    df_3 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v3/fact/granularity=weekly/date=%s/" % test_data[0])

    df_1.createOrReplaceTempView("daily_rank")
    df_2.createOrReplaceTempView("daily_est")
    df_3.createOrReplaceTempView("weekly_rank")

    # Three `;`-separated statements defining, in order:
    #   unified_data_test - full outer join of rank_unified and store_unified
    #   unified_rank_filter_data_free_app_download - rows with any rank within
    #       the cut-off (1000, or 4000 for 'WW'), excluding device 'ios-all'
    #   unified_category_filter_data_free_app_download - adds est_*_category
    #       columns that are NULL when the matching rank is missing, <= 0 or
    #       beyond the cut-off
    # How SqlParser turns these into views/tasks is not visible here.
    sql_text = """
    
    -- rank_unified,store_unified
    WITH unified_data_test AS 
    ( 
                    SELECT          store_unified.country_code, 
                                    store_unified.device_code, 
                                    store_unified.free_app_download AS est_free_app_download , 
                                    store_unified.paid_app_download AS est_paid_app_download, 
                                    store_unified.revenue           AS est_revenue, 
                                    store_unified.revenue_iap       AS est_revenue_iap, 
                                    store_unified.revenue_non_iap   AS est_revenue_non_iap, 
                                    rank_unified.category_id, 
                                    rank_unified.app_id, 
                                    rank_unified.free_app_download, 
                                    rank_unified.paid_app_download, 
                                    rank_unified.revenue, 
                                    rank_unified.revenue_iap, 
                                    rank_unified.revenue_non_iap, 
                                    rank_unified.granularity, 
                                    rank_unified.date 
                    FROM            rank_unified 
                    FULL OUTER JOIN store_unified 
                    ON              rank_unified.app_id = store_unified.app_id 
                    AND             rank_unified.country_code = store_unified.country_code 
                    AND             rank_unified.device_code = store_unified.device_code 
                    AND             rank_unified.date = store_unified.date );



    WITH unified_rank_filter_data_free_app_download AS 
    ( 
           SELECT * 
           FROM   unified_data_test 
           WHERE ( ( ( 
                                free_app_download<=1000 
                         AND    country_code!="WW" ) 
                  OR     ( 
                                free_app_download<=4000 
                         AND    country_code=="WW" ) ) 
           OR     ( ( 
                                paid_app_download<=1000 
                         AND    country_code!="WW" ) 
                  OR     ( 
                                paid_app_download<=4000 
                         AND    country_code=="WW" ) ) 
           OR     ( ( 
                                revenue<=1000 
                         AND    country_code!="WW" ) 
                  OR     ( 
                                revenue<=4000 
                         AND    country_code=="WW" ) ) )
           AND    device_code!='ios-all'
    );



           WITH unified_category_filter_data_free_app_download AS 
    ( 
           SELECT * ,
           CASE WHEN (free_app_download > 1000 and country_code !='WW') or (free_app_download > 4000 and country_code =='WW' ) or (free_app_download is null or free_app_download <= 0) Then null else est_free_app_download END as est_free_app_download_category,
           CASE WHEN (paid_app_download > 1000 and country_code !='WW') or (paid_app_download > 4000 and country_code =='WW' ) or (paid_app_download is null or paid_app_download <= 0) Then null else est_paid_app_download END as est_paid_app_download_category,
           CASE WHEN (revenue > 1000 and country_code !='WW') or (revenue > 4000 and country_code =='WW') or (revenue is null or revenue <= 0) Then null else est_revenue  END as est_revenue_category
           FROM   unified_rank_filter_data_free_app_download 


    );    

    

    

    """
    
    # store_unified , rank_unified
    # NOTE(review): `namespace` is assigned but never read; the message below
    # repeats the literal instead.
    namespace = "aa.store.market-size.v1"
    # NOTE(review): `test_date` is not a parameter or local of this function
    # (the parameter is `test_data`). Unless a global `test_date` exists,
    # building this dict raises NameError - confirm the intended value.
    ingest_msg = {
        "namespace": "aa.store.market-size.v1",
        "job_type": "routine",
        "options": {},
        "source": [
            {
                "data_encoding": "parquet",
                "compression": "gzip",
                "name": "store_unified",
                "path": [
                    "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={}".format(
                        test_date)],
                # "path": est_list,

            }, {
                "data_encoding": "parquet",
                "compression": "gzip",
                "name": "rank_unified",
                "path": [
                    "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=daily/date={}".format(
                        test_date)],
                # "path": rank_list,

            }
        ]
    }
    
    # dry_run defaults to True, so the DryRunSqlExecutor is used.
    run(spark, ingest_msg, sql_text)
    
    # spark.sql("select * from download_attribution where product_id=284035177 and country_code='ww'").show()
    # spark.sql("select * from compare_data_unified where device_code='ios-tablet' ").show()
    # spark.sql("select * from compare_data_raw").show()

    # spark.sql("select * from caculate_data where app_id=20600000009072 and unified_country_code='WW' and unified_device_code='android-all'").show()
    # NOTE(review): `compare_data_raw` / `compare_data_unified` are not defined
    # by sql_text above or by the views registered in this function - confirm
    # they exist in the session before relying on these checks.
    # Two-way EXCEPT ALL plus a row-count comparison between the two views.
    spark.sql("select * from compare_data_raw where app_id is not null except all select * from compare_data_unified where app_id is not null").show()
    spark.sql("select * from compare_data_unified except all select * from compare_data_raw").show()
    count_1 = spark.sql("select count(*) from compare_data_raw where app_id is not null").take(1)
    count_2 = spark.sql("select count(*) from compare_data_unified ").take(1)
    if count_1[0][0] != count_2[0][0]:
        print 'failed!!!!!!!!!!!!!'

    # spark.sql("select * from compare_data_unified_add_to_est where diff !=0  ").show()
    eject_all_caches(spark)

    

    
# Python 2 `map` is eager: the function runs on the driver for every entry of
# test_path; parallelize only wraps the list of return values in a
# 1-partition RDD that is neither assigned nor acted on.
# NOTE(review): this applies `test_download_attribution`, which is not defined
# in this cell; the function defined just above is `test_weekly_data` -
# confirm which one is meant.
sc.parallelize(map(test_download_attribution, test_path), 1)

In [0]:

# Show the current contents of test_path for manual inspection.
print test_path

In [0]:

# Register one weekly partition of store.app-est-category.v3 as the `test` view.
weekly_category_df = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v3/fact/granularity=weekly/date=2020-05-02/")
weekly_category_df.createOrReplaceTempView("test")
# s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v3/fact/granularity=monthly/date=2020-04-30/

# Last expression of the cell: the (lazy) DataFrame over the `test` view.
spark.sql("select * from test")

In [0]:

from pyspark.sql import functions as F
# Tiny demo of F.shuffle on an array column: three rows, each holding one
# integer array (two of them containing a NULL element).
sample_rows = [([1, 20, 3, 5],), ([1, 20, None, 3],), ([6, 20, None, 3],)]
df = spark.createDataFrame(sample_rows, ['data'])
df.show()
# Element order in each result array is random from run to run.
df.select(F.shuffle(df['data']).alias('s')).collect()

In [0]:

from pyspark.sql import functions as F
import datetime

# Date window for the test. The end date is exclusive: only the days in
# [start_week, end_week) are visited.
start_week = "2020-04-12"
end_week = "2020-04-19"
real_date1 = datetime.date(*[int(x) for x in start_week.split('-')])
real_date2 = datetime.date(*[int(x) for x in end_week.split('-')])
date_range = real_date2 - real_date1

# Bucket the days into weeks that close on Saturday (weekday() == 5).
# On each Saturday the days collected so far are emitted newest-first as
# {saturday: [days...]} and the bucket is emptied; days after the last
# Saturday stay in `dates` and produce no entry.
dates = list()
sar_list = list()
for day_offset in range(date_range.days):
    current_day = real_date1 + datetime.timedelta(day_offset)
    dates.append(current_day)
    if current_day.weekday() == 5:
        week_days = dates[::-1]
        del dates[:]
        sar_list.append({current_day: week_days})

# One (week_end, [day strings]) tuple per completed week.
test_path = list()
for week in sar_list:
    for week_end, week_days in week.items():
        test_path.append((week_end.strftime("%Y-%m-%d"), [day.strftime("%Y-%m-%d") for day in week_days]))
        
        
        
def check_diff(weekly_data):
    print 'weekly_data[0]', weekly_data[0]
    print 'weekly_data[1]', weekly_data[1]
    df_1 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=daily/date={%s}/" % ",".join(weekly_data[1]))
    df_2 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={%s}/" % ",".join(weekly_data[1]))
    df_3 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=weekly/date=%s/" % weekly_data[0])

    df_1.createOrReplaceTempView("daily_rank")
    df_2.createOrReplaceTempView("daily_est")
    df_3.createOrReplaceTempView("weekly_rank")

    spark.sql('''select daily_est.date, daily_est.free_app_download, daily_est.paid_app_download, daily_est.revenue, daily_est.app_id, daily_est.device_code, daily_est.country_code, daily_rank.category_id from daily_rank 
                full outer join daily_est 
                on daily_rank.country_code= daily_est.country_code
                and daily_rank.device_code=daily_est.device_code
                and daily_rank.app_id = daily_est.app_id 
                and daily_rank.date = daily_est.date''').createOrReplaceTempView("daily_est_rank_join")


    spark.sql("select app_id, country_code, device_code, category_id, sum(free_app_download) as free_app_download, sum(paid_app_download) as paid_app_download, sum(revenue) as revenue from daily_est_rank_join group by country_code, device_code, category_id, app_id ").createOrReplaceTempView("sum_daily")
        
    spark.sql(''' select *, 
                    ROW_NUMBER() OVER (PARTITION BY device_code, country_code, category_id ORDER BY free_app_download DESC, app_id DESC) free_app_download_rank,
                    ROW_NUMBER() OVER (PARTITION BY device_code, country_code, category_id ORDER BY paid_app_download DESC, app_id DESC) paid_app_download_rank,
                    ROW_NUMBER() OVER (PARTITION BY device_code, country_code, category_id ORDER BY revenue DESC, app_id DESC) revenue_rank
                    from sum_daily 
                    ''').createOrReplaceTempView("sum_rank_daily")
                        
    # spark.sql('''select app_id, country_code, free_app_download, device_code, category_id 
    #                 from weekly_rank 
    #                 where country_code='US' 
    #                     and device_code='ios-all' 
    #                     and category_id=100012  
    #                     and free_app_download is not null 
    #                     order by free_app_download desc''').createOrReplaceTempView("weekly_rank_1")

    # spark.sql("select * from sum_rank_daily where country_code='US' and category_id=100000 and device_code='ios-phone' order by free_app_download desc").show(1000)
    spark.sql("select * from sum_rank_daily where country_code='US' and category_id=100022 and device_code='ios-phone' order by paid_app_download desc").show(1005)
    # spark.sql("select * from sum_rank_daily where country_code='US' and category_id=100000 and device_code='ios-phone' order by revenue desc").show(1000)

# Python 2 `map` is eager: check_diff runs on the driver once per week in
# test_path; parallelize only wraps the list of return values (check_diff has
# no return statement) in a 3-partition RDD that is neither assigned nor
# acted on.
sc.parallelize(map(check_diff, test_path), 3)



In [0]:

# Scratch arithmetic; evaluates to -248812.
1368840 - 1617652


In [0]:

import datetime
# Date window. The end date is exclusive: only the days in [start, end) are
# generated, so 2010-07-31 itself is not part of month_day.
start = "2010-07-04"
end = "2010-07-31"
# end = "2012-05-01"
real_date1 = datetime.date(*[int(x) for x in start.split('-')])
real_date2 = datetime.date(*[int(x) for x in end.split('-')])
date_range = real_date2 - real_date1
dates = list()

# Every day of the window as a datetime.date.
month_day = [real_date1 + datetime.timedelta(day_offset) for day_offset in range(date_range.days)]

# Distinct 'YYYY-MM' months covered by the window, in ascending order.
test_list = sorted(set(day.strftime("%Y-%m") for day in month_day))



In [0]:

from pyspark.sql import functions as F
import datetime

# Date window for the test. The end date is exclusive: only the days in
# [start_week, end_week) are visited.
start_week = "2010-07-04"
end_week = "2010-07-12"
# end = "2012-05-01"
real_date1 = datetime.date(*[int(x) for x in start_week.split('-')])
real_date2 = datetime.date(*[int(x) for x in end_week.split('-')])
date_range = real_date2 - real_date1

# Bucket the days into weeks that close on Saturday (weekday() == 5).
# On each Saturday the days collected so far are emitted newest-first as
# {saturday: [days...]} and the bucket is emptied; days after the last
# Saturday stay in `dates` and produce no entry.
dates = list()
sar_list = list()
for day_offset in range(date_range.days):
    current_day = real_date1 + datetime.timedelta(day_offset)
    dates.append(current_day)
    if current_day.weekday() == 5:
        week_days = dates[::-1]
        del dates[:]
        sar_list.append({current_day: week_days})

# One (week_end, [day strings]) tuple per completed week.
test_path = list()
for week in sar_list:
    for week_end, week_days in week.items():
        test_path.append((week_end.strftime("%Y-%m-%d"), [day.strftime("%Y-%m-%d") for day in week_days]))
        
        
        
def check_diff(weekly_data):
    """Compare the app ids ranked in the weekly data with those seen in the daily data.

    :param weekly_data: (week_end, [day strings]) tuple as built in `test_path`.

    Registers the views daily_rank, daily_est, weekly, join_category_daily,
    sum_daily, count_daily, test_sum_daily and unified_weekly_data, then shows
    the app ids present on one side but not the other (both directions).
    Empty output in both directions means the two sides agree.
    """
    day_glob = ",".join(weekly_data[1])
    df_1 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=daily/date={%s}/" % day_glob)
    df_2 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={%s}/" % day_glob)
    df_3 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=weekly/date=%s/" % weekly_data[0])

    df_1.createOrReplaceTempView("daily_rank")
    df_2.createOrReplaceTempView("daily_est")
    df_3.createOrReplaceTempView("weekly")

    # Same text as the former two-item print statement, valid on Python 2 and 3.
    print("compare app id:  %s" % (weekly_data,))
    # spark.sql("select distinct app_id from monthly except all select distinct app_id from  daily ").show(2)
    # spark.sql("select distinct app_id from daily except all select distinct app_id from  monthly ").show(2)
    # Attach the category of each (country, device, app, date) to its daily estimates.
    spark.sql('''select daily_est.free_app_download, daily_est.paid_app_download, daily_est.revenue, daily_est.app_id, daily_est.device_code, daily_est.country_code, daily_rank.category_id from daily_rank 
                join daily_est 
                on daily_rank.country_code= daily_est.country_code
                and daily_rank.device_code=daily_est.device_code
                and daily_rank.app_id = daily_est.app_id 
                and daily_rank.date = daily_est.date ''').createOrReplaceTempView("join_category_daily")

                # sum(paid_app_download) as paid_app_download_count , sum(revenue) as revenue_count, 
    # Weekly free-download total per app within each country / device / category.
    spark.sql("select app_id, country_code, sum(free_app_download) as free_app_download_count, device_code, category_id from join_category_daily group by country_code,device_code, category_id, app_id ").createOrReplaceTempView("sum_daily")

    # Number of apps per country / device / category, and the apps that have a total.
    spark.sql("select country_code, device_code, category_id, count(*) as total_daily_count from sum_daily group by country_code,device_code, category_id ").createOrReplaceTempView("count_daily")
    spark.sql("select app_id, country_code,device_code, category_id, free_app_download_count from sum_daily where free_app_download_count is not null ").createOrReplaceTempView("test_sum_daily")
    # Weekly rows whose free_app_download value does not exceed the number of
    # apps counted for the same country / device / category.
    spark.sql('''select * from
                    (select app_id, weekly.country_code, weekly.device_code, weekly.category_id, free_app_download, total_daily_count from weekly 
                     join count_daily on
                    weekly.country_code=count_daily.country_code
                    and weekly.device_code=count_daily.device_code
                    and weekly.category_id=count_daily.category_id) as prod where free_app_download<=total_daily_count ''').createOrReplaceTempView("unified_weekly_data")

    # Two-way difference. The second query previously subtracted test_sum_daily
    # from itself, which is always empty and hid ids missing from the weekly side.
    spark.sql('''select app_id from unified_weekly_data except all select app_id from test_sum_daily ''').show()
    spark.sql('''select app_id from test_sum_daily except all select app_id from unified_weekly_data ''').show()

    # spark.sql("select * from weekly")
    # spark.sql("select app_id, country_code, free_app_download, device_code ,category_id left join sum_daily ")
    # spark.sql("select app_id, country_code, device_code ,category_id from sum_daily where free_app_download is not null").createOrReplaceTempView("daily_free_app")
    # total_count = spark.sql("select count(*) from daily_free_app").take(1)
    # print total_count
    # spark.sql("select app_id, country_code, device_code ,category_id from sum_daily where paid_app_download is not null").createOrReplaceTempView("daily_paid_app")
    # spark.sql("select app_id, country_code, device_code ,category_id from sum_daily where revenue is not null").createOrReplaceTempView("daily_revenue")



    # spark.sql("select count(*) from daily_free_app").take(1)

    # spark.sql("select app_id, country_code, device_code ,category_id from daily_free_app except all select app_id, country_code, device_code ,category_id from weekly where free_app_download<= {} ".format(total_count[0][0])).show()
    # spark.sql("select app_id, country_code, device_code ,category_id from weekly where free_app_download<={}  except all select app_id, country_code, device_code ,category_id from daily_free_app ".format(total_count[0][0])).show()

    # spark.sql("select app_id,country_code,free_app_download,paid_app_download,revenue,device_code,category_id from sum_daily except all select app_id,country_code,free_app_download,paid_app_download,revenue,device_code,category_id from monthly").show(2)
    # spark.sql("select app_id,country_code,free_app_download,paid_app_download,revenue,device_code,category_id from monthly except all select app_id,country_code,free_app_download,paid_app_download,revenue,device_code,category_id from sum_daily").show(2)

    # except_1 = spark.sql("select app_id, country_code,free_app_download,device_code,category_id from order_sum_daily_free_download except all select app_id, country_code,free_app_download,device_code,category_id from weekly").withColumn("date", F.lit(weekly_data[0]) ).withColumn("type",F.lit("daily_weekly"))
    # except_2 = spark.sql("select app_id, country_code,paid_app_download,device_code,category_id from order_sum_daily_paid_download except all select app_id, country_code,paid_app_download,device_code,category_id from weekly").withColumn("date", F.lit(weekly_data[0]) ).withColumn("type",F.lit("daily_weekly"))
    # except_3 = spark.sql("select app_id, country_code,revenue,device_code,category_id from order_sum_daily_revenue except all select app_id, country_code,revenue,device_code,category_id from weekly").withColumn("date", F.lit(weekly_data[0]) ).withColumn("type",F.lit("daily_weekly"))

    # except_1.show(30)
    # except_2.show()
    # except_3.show()
    
    # df_write_result = except_2.union(except_1).cache()
    # df_write_result.createOrReplaceTempView("df_write_result")
    #if df_write_result.rdd.isEmpty():
    #    print 'pass'
#    else:
 #       print 'failed!!!!!!!' , month

    # from aadatapipelinecore.core.utils.retry import retry
    # def write_test_result(df_write_result):
    #     df_write_result.write.format("delta").save("s3://b2c-prod-data-pipeline-qa/aa.store/result_store_unified_monthly_category_count/",
    #                                       mode="append",
    #                                       partitionBy=["date"])
    # retry(write_test_result,(df_write_result,),{},interval=10)

# Python 2 `map` is eager: check_diff runs on the driver once per week in
# test_path; parallelize only wraps the list of return values in a
# 1-partition RDD that is neither assigned nor acted on.
sc.parallelize(map(check_diff, test_path), 1)

In [0]:

from pyspark.sql import functions as F
import datetime

# Date window for the test. The end date is exclusive: only the days in
# [start_week, end_week) are visited.
start_week = "2010-07-04"
end_week = "2010-07-12"
# end = "2012-05-01"
real_date1 = datetime.date(*[int(x) for x in start_week.split('-')])
real_date2 = datetime.date(*[int(x) for x in end_week.split('-')])
date_range = real_date2 - real_date1

# Bucket the days into weeks that close on Saturday (weekday() == 5).
# On each Saturday the days collected so far are emitted newest-first as
# {saturday: [days...]} and the bucket is emptied; days after the last
# Saturday stay in `dates` and produce no entry.
dates = list()
sar_list = list()
for day_offset in range(date_range.days):
    current_day = real_date1 + datetime.timedelta(day_offset)
    dates.append(current_day)
    if current_day.weekday() == 5:
        week_days = dates[::-1]
        del dates[:]
        sar_list.append({current_day: week_days})

# One (week_end, [day strings]) tuple per completed week.
test_path = list()
for week in sar_list:
    for week_end, week_days in week.items():
        test_path.append((week_end.strftime("%Y-%m-%d"), [day.strftime("%Y-%m-%d") for day in week_days]))
        
        
        
def check_diff(weekly_data):
    df_1 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=daily/date={%s}/" % ",".join(weekly_data[1]))
    df_2 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={%s}/" % ",".join(weekly_data[1]))
    df_3 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=weekly/date=%s/" % weekly_data[0])
    df_4 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=weekly/date=%s/" % weekly_data[0])

    df_1.createOrReplaceTempView("daily_rank")
    df_2.createOrReplaceTempView("daily_est")
    df_3.createOrReplaceTempView("weekly_rank")
    df_3.createOrReplaceTempView("weekly_est")
    
    print "compare app id: " , weekly_data
    # spark.sql("select distinct app_id from monthly except all select distinct app_id from  daily ").show(2)
    # spark.sql("select distinct app_id from daily except all select distinct app_id from  monthly ").show(2)
    spark.sql('''select daily_est.free_app_download, daily_est.paid_app_download, daily_est.revenue, daily_est.app_id, daily_est.device_code, daily_est.country_code, daily_rank.category_id from daily_rank 
                join daily_est 
                on daily_rank.country_code= daily_est.country_code
                and daily_rank.device_code=daily_est.device_code
                and daily_rank.app_id = daily_est.app_id 
                and daily_rank.date = daily_est.date ''').createOrReplaceTempView("sum_category_daily")

    spark.sql("select app_id, country_code, sum(free_app_download) as free_app_download , sum(paid_app_download) as paid_app_download , sum(revenue) as revenue, device_code ,category_id from sum_category_daily group by country_code,device_code, category_id, app_id ").createOrReplaceTempView("sum_daily")
        
    spark.sql("select app_id, country_code, free_app_download, device_code, category_id from sum_daily where free_app_download is not null").createOrReplaceTempView("sum_daily_free_download")
    # spark.sql("select app_id, country_code, paid_app_download, device_code, category_id from sum_daily where paid_app_download is not null ").createOrReplaceTempView("sum_daily_paid_app_download")
    # spark.sql("select app_id, country_code, revenue , device_code ,category_id from sum_daily where revenue is not null ").createOrReplaceTempView("sum_daily_revenue")


    spark.sql('''select app_id, country_code, device_code, category_id, free_app_download, ROW_NUMBER() OVER (PARTITION BY device_code, country_code, category_id ORDER BY free_app_download DESC) free_app_download_rank from sum_daily_free_download    ''').createOrReplaceTempView("order_sum_daily_free_download_rank")


    


    
    # spark.sql('''select app_id, country_code, device_code, category_id, free_app_download, dense_rank() OVER (PARTITION BY device_code, country_code, category_id ORDER BY free_app_download DESC) free_app_download_rank from sum_daily_free_download''').createOrReplaceTempView("order_sum_daily_free_download_dense_rank")
    
    # spark.sql('''select app_id, country_code, device_code, category_id, free_app_download, row_number() OVER (PARTITION BY device_code, country_code, category_id ORDER BY free_app_download DESC) free_app_download_rank from sum_daily_free_download''').createOrReplaceTempView("order_sum_daily_free_download_row_number")


    # spark.sql('''select app_id, country_code, device_code, category_id, ROW_NUMBER() OVER (PARTITION BY device_code, country_code, category_id ORDER BY paid_app_download DESC) paid_app_download from sum_daily_paid_app_download''').createOrReplaceTempView("order_sum_daily_paid_download")
    # spark.sql('''select app_id, country_code, device_code, category_id, ROW_NUMBER() OVER (PARTITION BY device_code, country_code, category_id ORDER BY revenue DESC) revenue from sum_daily_revenue''').createOrReplaceTempView("order_sum_daily_revenue")




    # spark.sql("select app_id, country_code, free_app_download , paid_app_download , revenue, device_code,category_id from daily group by country_code,device_code,app_id,category_id ").createOrReplaceTempView("sum_daily")
    # spark.sql("select app_id,country_code,free_app_download,paid_app_download,revenue,device_code,category_id from sum_daily except all select app_id,country_code,free_app_download,paid_app_download,revenue,device_code,category_id from monthly").show(2)
    # spark.sql("select app_id,country_code,free_app_download,paid_app_download,revenue,device_code,category_id from monthly except all select app_id,country_code,free_app_download,paid_app_download,revenue,device_code,category_id from sum_daily").show(2)

    spark.sql('''select count(*) from 
                    (select app_id, country_code,free_app_download,device_code,category_id from order_sum_daily_free_download_rank except all select app_id, country_code,free_app_download,device_code,category_id from weekly) as test''').withColumn("date", F.lit(weekly_data[0]) ).withColumn("type",F.lit("daily_weekly")).show()
    # except_2 = spark.sql("select app_id, country_code,paid_app_download,device_code,category_id from order_sum_daily_paid_download except all select app_id, country_code,paid_app_download,device_code,category_id from weekly").withColumn("date", F.lit(weekly_data[0]) ).withColumn("type",F.lit("daily_weekly"))
    # except_3 = spark.sql("select app_id, country_code,revenue,device_code,category_id from order_sum_daily_revenue except all select app_id, country_code,revenue,device_code,category_id from weekly").withColumn("date", F.lit(weekly_data[0]) ).withColumn("type",F.lit("daily_weekly"))

    # except_1.show(30)
    # except_2.show()
    # except_3.show()
    
    # df_write_result = except_2.union(except_1).cache()
    # df_write_result.createOrReplaceTempView("df_write_result")
    #if df_write_result.rdd.isEmpty():
    #    print 'pass'
#    else:
 #       print 'failed!!!!!!!' , month

    # from aadatapipelinecore.core.utils.retry import retry
    # def write_test_result(df_write_result):
    #     df_write_result.write.format("delta").save("s3://b2c-prod-data-pipeline-qa/aa.store/result_store_unified_monthly_category_count/",
    #                                       mode="append",
    #                                       partitionBy=["date"])
    # retry(write_test_result,(df_write_result,),{},interval=10)

# Python 2's map() is eager, so check_diff has already run on the driver for
# every entry of test_path before parallelize() is called; the RDD built from
# the return values is never acted on.
sc.parallelize(map(check_diff, test_path), 1)

In [0]:

from pyspark.sql import functions as F
import datetime

start_week = "2020-01-12"
end_week = "2020-01-19"
# end = "2012-05-01"

# Parse the ISO strings into dates; end_week itself is excluded from the walk below.
real_date1 = datetime.date(*map(int, start_week.split('-')))
real_date2 = datetime.date(*map(int, end_week.split('-')))
date_range = real_date2 - real_date1

dates = []
sar_list = []
for days in xrange(date_range.days):
    current_day = real_date1 + datetime.timedelta(days)
    dates.append(current_day)
    if current_day.weekday() != 5:
        continue
    # A Saturday (weekday() == 5) closes a week: move every pending day, newest
    # first, under that Saturday and start collecting the next week.
    temp = dates[::-1]
    del dates[:]
    sar_list.append({current_day: temp})

# Flatten into (week_end, [day, ...]) string tuples, the shape check_diff consumes.
test_path = [
    (week_end.strftime("%Y-%m-%d"), [day.strftime("%Y-%m-%d") for day in week_days])
    for week in sar_list
    for week_end, week_days in week.items()
]
        
        
        
def check_diff(weekly_data):
    print 'weekly_data[0]', weekly_data[0]
    print 'weekly_data[1]', weekly_data[1]
    df_1 = spark.read.option("basePath", "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/").parquet("s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/date={%s}" % ",".join(weekly_data[1]))
    df_2 = spark.read.option("basePath", "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=WEEK/").parquet("s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=WEEK/date={%s}/platform=ios/" % weekly_data[0])

    df_1.createOrReplaceTempView("daily_data")
    df_2.createOrReplaceTempView("weekly_data")
    spark.sql("select * from daily_data").show(2)
    spark.sql("select * from weekly_data").show(2)

# Python 2's map() is eager, so check_diff has already run on the driver for
# every entry of test_path before parallelize() is called; the RDD built from
# the return values is never acted on.
sc.parallelize(map(check_diff, test_path), 1)


In [0]:

# Sum `est` per (id, store_id, category_id, platform_id, vertical, feed, platform)
# over the ios rows of `daily_data`, pivot the six feeds into columns, rename
# each feed column through platform_feed_to_metric, replace nulls with 0, cache
# the result and register it as the `transfer` view.
# NOTE(review): platform_feed_to_metric is defined in a later cell of this
# notebook, so unless it is also defined earlier this cell fails on a fresh
# top-to-bottom run.
test_df = spark.sql('''
SELECT * 
FROM   ( 
                SELECT   id, 
                         store_id, 
                         category_id, 
                         platform_id, 
                         vertical, 
                         feed, 
                         platform, 
                         Sum(est) AS est
                FROM     daily_data 
                WHERE    feed IN ( 0, 
                                  1, 
                                  2, 
                                  101, 
                                  100, 
                                  102 ) 
                AND      platform='ios' 
                GROUP BY id, 
                         store_id, 
                         category_id, 
                         platform_id, 
                         vertical, 
                         feed,
                         platform ) PIVOT ( max(est) FOR feed IN (0, 
                                                                  1, 
                                                                  2, 
                                                                  101, 
                                                                  100, 
                                                                  102) ) ''').withColumnRenamed("0", platform_feed_to_metric('ios',0)).withColumnRenamed("1", platform_feed_to_metric('ios',1)).withColumnRenamed("2", platform_feed_to_metric('ios',2)).withColumnRenamed("101", platform_feed_to_metric('ios',101)).withColumnRenamed("100", platform_feed_to_metric('ios',100)).withColumnRenamed("102", platform_feed_to_metric('ios',102)).na.fill(0).cache()
test_df.createOrReplaceTempView("transfer")



In [0]:


# (platform, feed, metric column name) triples known to platform_feed_to_metric.
_PLATFORM_FEED_METRICS = (
    ('ios', 0, 'iphone_free'),
    ('ios', 1, 'iphone_paid'),
    ('ios', 2, 'iphone_revenue'),
    ('ios', 101, 'ipad_free'),
    ('ios', 100, 'ipad_paid'),
    ('ios', 102, 'ipad_revenue'),
    ('android', 0, 'est_free_app_download'),
    ('android', 1, 'est_paid_app_download'),
    ('android', 2, 'est_revenue'),
)


def platform_feed_to_metric(platform, feed):
    """Return the metric column name for a ``(platform, feed)`` pair.

    Args:
        platform: platform name, ``'ios'`` or ``'android'``.
        feed: numeric feed code for that platform.

    Returns:
        The metric name, e.g. ``'iphone_free'`` for ``('ios', 0)``.

    Raises:
        IndexError: if the pair is not in the mapping table. The exception
            type is unchanged; the message now names the offending pair.
    """
    for known_platform, known_feed, metric in _PLATFORM_FEED_METRICS:
        if (known_platform, known_feed) == (platform, feed):
            return metric
    raise IndexError(
        "no metric mapping for platform=%r, feed=%r" % (platform, feed))


# Smoke check; as the last expression of the cell its value is displayed.
platform_feed_to_metric("ios",0)

In [0]:


# print spark.sql("select id from (select id, store_id, category_id, iphone_free,iphone_paid, iphone_revenue, ipad_free, ipad_paid, ipad_revenue from transfer except all select id, store_id, category_id, iphone_free,iphone_paid, iphone_revenue, ipad_free, ipad_paid, ipad_revenue from weekly_data where platform='ios' ) as t1  except select id from (select id, store_id, category_id, iphone_free,iphone_paid, iphone_revenue, ipad_free, ipad_paid, ipad_revenue from weekly_data where platform='ios' except all select id, store_id, category_id, iphone_free,iphone_paid, iphone_revenue, ipad_free, ipad_paid, ipad_revenue from transfer) as t2 ").show()



# Rows of the ios weekly data that have no identical row in `transfer`
# (EXCEPT ALL), minus the rows that are only present in `transfer`.
# show() prints the table itself and returns None, so it is not wrapped in print.
spark.sql("select * from (select id, store_id, category_id, iphone_free,iphone_paid, iphone_revenue, ipad_free, ipad_paid, ipad_revenue from weekly_data where platform='ios' except all select id, store_id, category_id, iphone_free,iphone_paid, iphone_revenue, ipad_free, ipad_paid, ipad_revenue from transfer) as t2 except select * from (select id, store_id, category_id, iphone_free,iphone_paid, iphone_revenue, ipad_free, ipad_paid, ipad_revenue from transfer except all select id, store_id, category_id, iphone_free,iphone_paid, iphone_revenue, ipad_free, ipad_paid, ipad_revenue from weekly_data where platform='ios' ) as t1 ").show()


# print spark.sql("select count(*) from (select id, store_id, category_id, iphone_free,iphone_paid, iphone_revenue, ipad_free, ipad_paid, ipad_revenue from transfer except all select id, store_id, category_id, iphone_free,iphone_paid, iphone_revenue, ipad_free, ipad_paid, ipad_revenue from weekly_data where platform='ios' ) as t1 ").show()
# print spark.sql("select  count(*)  from (select id, store_id, category_id, iphone_free,iphone_paid, iphone_revenue, ipad_free, ipad_paid, ipad_revenue from weekly_data where platform='ios' except all select id, store_id, category_id, iphone_free,iphone_paid, iphone_revenue, ipad_free, ipad_paid, ipad_revenue from transfer) as t2 ").show()


In [0]:

# print spark.sql("select count(*) from weekly_data ").show()
# spark.sql("select * from daily_data where id=281747159 and store_id=143451 and category_id=7001 and feed=102").show()
# spark.sql("select * from weekly_data where id=281747159 and store_id=143451 and category_id=7001").show()

In [0]:

# Ad-hoc ratio of two hard-coded figures.
# NOTE(review): the notebook does not record where either number comes from.
1166880.000/33192067.000



In [0]:


# Show the ios rows of the 2020-01-11 WEEK partition of the raw estimates for
# one (category_id, id, store_id) combination.
spark.read.option("basePath", "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=WEEK/").parquet("s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=WEEK/date=2020-01-11/platform=ios/").where("category_id=36 and id=304878510 and store_id=0  and platform='ios'").show()




In [0]:
%%sh

# List the ios files of the WEEK raw-estimate partition for 2020-01-11.
aws s3 ls s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=WEEK/date=2020-01-11/platform=ios/

# List the contents of the DAY raw-estimate partition for the same date.
aws s3 ls s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/date=2020-01-11/

In [0]:

from pyspark.sql import functions as F
import datetime

start_week = "2010-07-04"
end_week = "2010-07-11"
# end = "2012-05-01"

# Parse the ISO strings into dates; end_week itself is excluded from the walk below.
real_date1 = datetime.date(*map(int, start_week.split('-')))
real_date2 = datetime.date(*map(int, end_week.split('-')))
date_range = real_date2 - real_date1

dates = []
sar_list = []
for days in xrange(date_range.days):
    current_day = real_date1 + datetime.timedelta(days)
    dates.append(current_day)
    if current_day.weekday() != 5:
        continue
    # A Saturday (weekday() == 5) closes a week: move every pending day, newest
    # first, under that Saturday and start collecting the next week.
    temp = dates[::-1]
    del dates[:]
    sar_list.append({current_day: temp})

# Flatten into (week_end, [day, ...]) string tuples, the shape check_diff consumes.
test_path = [
    (week_end.strftime("%Y-%m-%d"), [day.strftime("%Y-%m-%d") for day in week_days])
    for week in sar_list
    for week_end, week_days in week.items()
]
        
        
        
def check_diff(weekly_data):
    print 'weekly_data[0]', weekly_data[0]
    print 'weekly_data[1]', weekly_data[1]
    df_1 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=daily/date={%s}/" % ",".join(weekly_data[1]))
    df_2 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={%s}/" % ",".join(weekly_data[1]))
    df_3 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=weekly/date=%s/" % weekly_data[0])

    df_1.createOrReplaceTempView("daily_rank")
    df_2.createOrReplaceTempView("daily_est")
    df_3.createOrReplaceTempView("weekly_rank")

    spark.sql('''select daily_est.free_app_download, daily_est.paid_app_download, daily_est.revenue, daily_est.app_id, daily_est.device_code, daily_est.country_code, daily_rank.category_id from daily_rank 
                full outer join daily_est 
                on daily_rank.country_code= daily_est.country_code
                and daily_rank.device_code=daily_est.device_code
                and daily_rank.app_id = daily_est.app_id 
                and daily_rank.date = daily_est.date''').createOrReplaceTempView("daily_est_rank_join")


    spark.sql("select app_id, country_code, sum(free_app_download) as free_app_download, device_code, category_id from daily_est_rank_join group by country_code, device_code, category_id, app_id ").createOrReplaceTempView("sum_daily")
        
    spark.sql(''' select *, 
                    ROW_NUMBER() OVER (PARTITION BY device_code, country_code, category_id ORDER BY free_app_download DESC) rank 
                    from sum_daily 
                    where country_code='US' and category_id=100012 and device_code='ios-all' 
                        order by free_app_download desc''').createOrReplaceTempView("sum_rank_daily")
                        
    spark.sql('''select app_id, country_code, free_app_download, device_code, category_id 
                    from weekly_rank 
                    where country_code='US' 
                        and device_code='ios-all' 
                        and category_id=100012  
                        and free_app_download is not null 
                        order by free_app_download desc''').createOrReplaceTempView("weekly_rank_1")


# Python 2's map() is eager, so check_diff has already run on the driver for
# every entry of test_path before parallelize() is called; the RDD built from
# the return values is never acted on.
sc.parallelize(map(check_diff, test_path), 1)

# Weekly rows with a value <= 70 that have no identical row among the
# recomputed daily ranks. The daily side keeps only apps whose download total
# occurs exactly once in sum_rank_daily, and exposes `rank` under the name
# free_app_download.
# NOTE(review): presumably weekly_rank.free_app_download holds a rank rather
# than a download count -- confirm.
spark.sql('''
SELECT * FROM (SELECT * FROM weekly_rank_1 
where free_app_download <= 70
) AS test_unified
EXCEPT
SELECT app_id, country_code, rank as free_app_download, device_code, category_id  FROM (
SELECT * FROM sum_rank_daily 
INNER JOIN
( SELECT free_app_download 
FROM   (SELECT Count(1) AS free_app_download_count, 
              free_app_download 
        FROM   sum_rank_daily 
        WHERE  free_app_download IS NOT NULL 
        GROUP  BY free_app_download) AS t 
WHERE  free_app_download_count = 1  
) AS filter_count

ON 
sum_rank_daily.free_app_download = filter_count.free_app_download
WHERE rank  <= 70
) test_raw order by free_app_download asc
''').show()


# The opposite direction: recomputed daily ranks (<= 70, unique totals only)
# that have no identical row in the weekly slice.
spark.sql('''
SELECT app_id, country_code, rank as free_app_download, device_code, category_id  FROM (
SELECT * FROM sum_rank_daily 
INNER JOIN
( SELECT free_app_download 
FROM   (SELECT Count(1) AS free_app_download_count, 
               free_app_download 
        FROM   sum_rank_daily 
        WHERE  free_app_download IS NOT NULL 
        GROUP  BY free_app_download) AS t 
WHERE  free_app_download_count = 1  
) AS filter_count

ON 
sum_rank_daily.free_app_download = filter_count.free_app_download
WHERE rank <= 70
) test_raw 

EXCEPT

SELECT * FROM (SELECT * FROM weekly_rank_1 
where free_app_download <= 70
) AS test_unified order by free_app_download asc

''').show()



In [0]:

# Follow app 373998688 (US, ios-all) through each view: daily ranks, daily
# estimates, their join, the recomputed daily ranking and the weekly ranking.
spark.sql("select * from daily_rank where app_id in (373998688) and country_code='US' and device_code='ios-all' order by category_id, date desc  ").show(200)
spark.sql("select * from daily_est where app_id in (373998688) and country_code='US' and device_code='ios-all' order by date desc ").show()
spark.sql("select * from daily_est_rank_join where app_id in (373998688) and country_code='US' and device_code='ios-all' and category_id=100012 ").show()
spark.sql("select * from sum_rank_daily where app_id in (373998688)").show()
spark.sql("select * from weekly_rank where app_id in (373998688) and country_code='US' and device_code='ios-all' and category_id=100012").show()


In [0]:

# Weekly rows of the US / ios-all / category 100012 slice, lowest
# free_app_download value first.
spark.sql("select * from weekly_rank where  country_code='US' and device_code='ios-all' and category_id=100012 order by free_app_download asc ").show()


In [0]:

# Cache the raw DAY estimates for 2019-07-21 .. 2019-07-27 and show, ordered
# by feed, the ios rows of one (store_id, id, category_id) combination.
df_raw_debug = spark.read.option("basePath", "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/").parquet("s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/date={2019-07-27,2019-07-26,2019-07-25,2019-07-24,2019-07-23,2019-07-22,2019-07-21}").cache()
df_raw_debug.where("store_id=143441 and id=1030437345 and platform='ios' and category_id=7011").orderBy("feed").show()

In [0]:

from pyspark.sql import types as T
from pyspark.sql import functions as F

# Column layout applied to the tab-separated raw daily estimate files read below.
csv_schema = T.StructType(
    [
        T.StructField("store_id", T.IntegerType(), True),
        T.StructField("date", T.StringType(), True),
        T.StructField("platform_id", T.IntegerType(), True),
        T.StructField("vertical", T.IntegerType(), True),
        T.StructField("feed", T.IntegerType(), True),
        T.StructField("id", T.LongType(), True),
        T.StructField("est", T.IntegerType(), True),
        T.StructField("category", T.IntegerType(), True),
        T.StructField("rank", T.IntegerType(), True)
    ]
)
# Comma-separated dates expanded inside the {...} glob of the path below.
test_date='2010-07-04,2010-07-05,2010-07-06,2010-07-07,2010-07-08,2010-07-09,2010-07-10'
# Read the ios files of those days, tag every row with platform='ios', cache,
# and show up to 1000 rows for one (store_id, id) combination.
raw2= spark.read.option("basePath","s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/").schema(csv_schema).csv( "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/{%s}/ios/sbe_est_app/*/"%(test_date), sep="\t").withColumn("platform", F.lit("ios")).cache()
raw2.filter("store_id=143441 and id=373998688 and platform='ios'  ").show(1000)


In [0]:


# Raw weekly TSV files for four weeks of August 2018 (store 143441). No schema
# is supplied, so the queries address columns by position (_c0, ...).
# This re-registers the temp view name `weekly_data`, which check_diff elsewhere
# in this notebook also uses for the parquet weekly data.
spark.read.option("basePath","s3://b2c-prod-dca-store-estimates/store_est/v_final/WEEK/").csv( "s3://b2c-prod-dca-store-estimates/store_est/v_final/WEEK/2018-08-{04,11,18,25}/ios/sbe_est_app/143441/", sep="\t").createOrReplaceTempView("weekly_data")
spark.sql("select * from weekly_data where _c0=1015763729").show()

# spark.read.option("basePath","s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/").csv( "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/2010-07-{04,05,06,07,08,09,10}/ios/sbe_est_app/143441/", sep="\t").createOrReplaceTempView("raw_data_daily")
# spark.sql("select *, cast(_c4 as int) as t1, cast(_c7 as int) as _c7 from raw_data_daily where _c5=373998688 order by t1 , _c7 desc   ").show(200)


# monthly data
# Same lookup against the raw monthly TSV files for 2018-08-31.
spark.read.option("basePath","s3://b2c-prod-dca-store-estimates/store_est/v_final/MONTH/").csv( "s3://b2c-prod-dca-store-estimates/store_est/v_final/MONTH/2018-08-31/ios/sbe_est_app/143441/", sep="\t").createOrReplaceTempView("monthly_data")
spark.sql("select * from monthly_data where _c0=1015763729").show()

# Sum the integer-cast _c10 column of the raw weekly rows for one _c0 value.
# The filter is applied with WHERE before GROUP BY (a WHERE clause after
# GROUP BY is not valid SQL), and the derived table is given an alias.
# NOTE(review): `est` is still one of the grouping keys, so SUM(est) is taken
# per distinct est value; drop it from GROUP BY if a single total per
# (_c1, _c0, _c2) was intended.
spark.sql('''
SELECT _c1,_c0,_c2, SUM(est) 
FROM
(
    SELECT *, CAST(_c10 as int) AS est FROM weekly_data
) AS weekly_est
WHERE _c0 = 1015763729
GROUP BY est,_c1,_c0,_c2 ''').show()
# spark.read.option("basePath","s3://b2c-prod-dca-store-estimates/store_est/v_final/WEEK/").csv( "s3://b2c-prod-dca-store-estimates/store_est/v_final/WEEK/2018-08-{05,06,07,08,09,10,11}/ios/sbe_est_app/143441/", sep="\t").createOrReplaceTempView("raw_data_ha")
# spark.sql("select * from raw_data_ha where _c0=1332651082 and _c7='7011'").show()


In [0]:

# Preview the recomputed daily ranking view (default 20 rows).
spark.sql("select * from sum_rank_daily").show()

In [0]:




# Weekly rows with a value <= 60 that have no identical row among the
# recomputed daily ranks (daily side limited to download totals that occur
# exactly once, with `rank` exposed as free_app_download).
# show() prints the table itself and returns None, so it is not wrapped in print.
spark.sql('''
SELECT * FROM (SELECT * FROM weekly_rank_1 
where free_app_download <= 60
) AS test_unified
EXCEPT
SELECT app_id, country_code, rank as free_app_download, device_code, category_id  FROM (
SELECT * FROM sum_rank_daily 
INNER JOIN
( SELECT free_app_download 
FROM   (SELECT Count(1) AS free_app_download_count, 
              free_app_download 
        FROM   sum_rank_daily 
        WHERE  free_app_download IS NOT NULL 
        GROUP  BY free_app_download) AS t 
WHERE  free_app_download_count = 1  
) AS filter_count

ON 
sum_rank_daily.free_app_download = filter_count.free_app_download
WHERE rank  <= 60
) test_raw order by free_app_download asc
''').show()


# The opposite direction: recomputed daily ranks (<= 60, unique totals only)
# that have no identical row in the weekly slice.
spark.sql('''
SELECT app_id, country_code, rank as free_app_download, device_code, category_id  FROM (
SELECT * FROM sum_rank_daily 
INNER JOIN
( SELECT free_app_download 
FROM   (SELECT Count(1) AS free_app_download_count, 
               free_app_download 
        FROM   sum_rank_daily 
        WHERE  free_app_download IS NOT NULL 
        GROUP  BY free_app_download) AS t 
WHERE  free_app_download_count = 1  
) AS filter_count

ON 
sum_rank_daily.free_app_download = filter_count.free_app_download
WHERE rank <= 60
) test_raw 

EXCEPT

SELECT * FROM (SELECT * FROM weekly_rank_1 
where free_app_download <= 60
) AS test_unified order by free_app_download asc

''').show()





In [0]:




# Rows of sum_rank_daily whose free_app_download value occurs exactly once,
# minus the rows of weekly_rank whose free_app_download value occurs exactly once.
# NOTE(review): both sides use SELECT * over a join, so the comparison relies
# on the two views having the same number and order of columns -- confirm.
# show() prints the table itself and returns None, so it is not wrapped in print.
spark.sql('''
SELECT * FROM (
SELECT * FROM sum_rank_daily 
INNER JOIN
( SELECT free_app_download 
FROM   (SELECT Count(1) AS free_app_download_count, 
               free_app_download 
        FROM   sum_rank_daily 
        WHERE  free_app_download IS NOT NULL 
        GROUP  BY free_app_download) AS t 
WHERE  free_app_download_count = 1  
) AS filter_count

ON 
sum_rank_daily.free_app_download = filter_count.free_app_download
) test_raw EXCEPT 
SELECT * FROM (SELECT * FROM weekly_rank 
INNER JOIN
( SELECT free_app_download 
FROM   (SELECT COUNT(1) AS free_app_download_count, 
               free_app_download 
        FROM   weekly_rank 
        WHERE  free_app_download IS NOT NULL 
        GROUP  BY free_app_download) AS t 
WHERE  free_app_download_count = 1  
) AS filter_count_weekly

ON 
weekly_rank.free_app_download = filter_count_weekly.free_app_download
) AS test_unified
''').show()



In [0]:

# Side-by-side listing of the US / category 100012 / ios-phone slice of the
# recomputed daily ranking and of the weekly data (up to 250 rows each).
# NOTE(review): check_diff in this notebook builds sum_rank_daily with
# device_code='ios-all' only, in which case the first query returns no rows.
# show() prints the table itself and returns None, so it is not wrapped in print.
spark.sql("select * from sum_rank_daily where free_app_download is not null and country_code='US' and category_id=100012 and device_code='ios-phone'  order by free_app_download desc    ").show(250)
spark.sql("select * from weekly_rank where free_app_download is not null and country_code='US' and category_id=100012 and device_code='ios-phone'  order by free_app_download asc ").show(250)


In [0]:

# Look up one app in the `daily_join` view for the US / ios-all / 100012 slice.
# NOTE(review): no visible cell registers a view named `daily_join`; confirm it
# exists in the session (the join view built by check_diff is named
# `daily_est_rank_join`).
spark.sql("select * from daily_join  where app_id=373998688 and country_code='US' and device_code='ios-all' and category_id=100012").show()
# spark.sql('''select * from sum_rank_daily where free_app_download is not null order by free_app_download desc''').show(70)
# spark.sql('''select * from weekly_rank where free_app_download is not null order by free_app_download desc''').show(70)


In [0]:

# Rows of weekly_rank whose free_app_download value occurs exactly once in the view.
# show() prints the table itself and returns None, so it is not wrapped in print.
spark.sql('''
SELECT * FROM weekly_rank 
INNER JOIN
( SELECT free_app_download 
FROM   (SELECT COUNT(1) AS free_app_download_count, 
               free_app_download 
        FROM   weekly_rank 
        WHERE  free_app_download IS NOT NULL 
        GROUP  BY free_app_download) AS t 
WHERE  free_app_download_count = 1  
) AS filter_count_weekly

ON 
weekly_rank.free_app_download = filter_count_weekly.free_app_download
''').show()


In [0]:

# How often each non-null free_app_download value occurs in weekly_rank,
# least frequent values first.
spark.sql('''
SELECT COUNT(1) AS free_app_download_count, 
               free_app_download 
        FROM   weekly_rank 
        WHERE  free_app_download IS NOT NULL 
        GROUP  BY free_app_download
        ORDER BY free_app_download_count ASC
''').show()

In [0]:

# Side-by-side listing of the US / category 100012 / ios-all slice of the
# recomputed daily ranking and of `weekly_join` (up to 250 rows each).
# NOTE(review): no visible cell registers a view named `weekly_join`; confirm
# it exists in the session.
# show() prints the table itself and returns None, so it is not wrapped in print.
spark.sql("select * from sum_rank_daily where free_app_download is not null and country_code='US' and category_id=100012 and device_code='ios-all' order by  free_app_download desc   ").show(250)
spark.sql("select * from weekly_join where free_app_download is not null and country_code='US' and category_id=100012 and device_code='ios-all'  order by  free_app_download desc ").show(250)

In [0]:

# df_write_result.createOrReplaceTempView("df_write_result")
# Show up to 2264 rows of `count_daily` for the AR / ios-all / category 100034
# slice, largest total_daily_count first.
# NOTE(review): `count_daily` is not registered in the visible part of this
# notebook; confirm which cell creates it.
spark.sql("select * from count_daily where category_id = 100034 and device_code='ios-all' and country_code='AR' order by total_daily_count desc").show(2264)
# spark.sql("select * from join_category_daily where country_code='AR' and category_id='100010' and device_code='ios-all' and app_id=336904996 ").show()
# spark.sql("select count(*) from daily_free_app").show()
# spark.sql("select * from weekly where country_code='AR' and category_id='100010' and device_code='ios-all' and app_id=336904996 ").show(20)
# spark.sql("select * from daily_free_app where country_code='AR' and category_id='100010' and device_code='ios-all' and app_id=336904996 ").show(20)

# spark.sql("select * from order_sum_daily_free_download_dense_rank where country_code='CO' and category_id='100009' and device_code='ios-all'  ").show(20)
# spark.sql("select * from order_sum_daily_free_download_row_number where country_code='CO' and category_id='100009' and device_code='ios-all'  ").show(20)


# spark.sql("select * from weekly where country_code='AE' and category_id='100009'  and device_code='ios-all' order by free_app_download , app_id asc ").show(50)
# spark.sql("select * from sum_daily where country_code='CO' and category_id='100009' and app_id in (311992257,369522274) and device_code='ios-all'  ").show()
# spark.sql("select * from order_sum_daily_free_download where country_code='CO' and category_id='100009' and app_id in (311992257,369522274) and device_code='ios-all'  ").show()
# spark.sql("select * from weekly where country_code='CO' and category_id='100009' and app_id in (311992257,369522274) and device_code='ios-all' ").show()

In [0]:


# Raw DAY estimate files for 2020-03-07. The stray trailing "and feed" was
# removed from the predicate, so every feed of the app is shown.
# NOTE(review): another cell of this notebook reads the same
# store_est/v_final/DAY/.../sbe_est_app layout as tab-separated CSV with an
# explicit schema; confirm these files really are parquet. The view name says
# "weekly" although the path is a DAY partition.
spark.read.option("basePath","s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/").parquet("s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/2020-03-07/ios/sbe_est_app/*/").createOrReplaceTempView("raw_cat_weekly_data")
spark.sql("""select * from raw_cat_weekly_data where id=335545504 and store_id=143481""").show()


# Unified weekly category-rank rows of the same app (AE, ios-tablet), week 2020-03-07.
spark.read.option("basePath","s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=weekly/date={2020-03-07}/").createOrReplaceTempView("cat_weekly_data")
spark.sql("""select * from cat_weekly_data where app_id=335545504 and country_code=  'AE' and device_code='ios-tablet'""").show()


# Unified daily category-rank rows of the same app for 2020-03-04.
spark.read.option("basePath","s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=daily/date={2020-03-04}/").createOrReplaceTempView("cat_data")
spark.sql("""select * from cat_data where app_id=335545504 and country_code=  'AE' and device_code='ios-tablet'""").show()

# Unified daily estimate rows of the same app for 2020-03-04.
spark.read.option("basePath","s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={2020-03-04}/").createOrReplaceTempView("estt_data")
spark.sql("""select * from estt_data where app_id=335545504 and country_code=  'AE' and device_code='ios-tablet'""").show()



In [0]:
%%sh
# Recursively list the unified daily category-rank objects under the date=2020-03-04 prefix.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=daily/date=2020-03-04  --recursive
# Recursively list the unified daily estimate objects under the date=2020-03-04 prefix.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date=2020-03-04  --recursive

In [0]:
   
# Pairs of integer ids for Android categories.
# NOTE(review): presumably (raw store category id, unified category_id);
# confirm against the code that consumes this table.
ANDROID_CATEGORIES = [
    (1, 400000), (2, 400001), (3, 400022), (4, 400023), (5, 400024),
    (6, 400008), (7, 400011), (8, 400014), (9, 400017), (10, 400020),
    (11, 400025), (12, 400030), (13, 400031), (14, 400032), (15, 400033),
    (16, 400035), (17, 400036), (18, 400038), (19, 400040), (20, 400042),
    (21, 400043), (22, 400044), (23, 400058), (24, 400046), (25, 400047),
    (26, 400048), (27, 400050), (28, 400051), (29, 400052), (30, 400053),
    (31, 400054), (32, 400055), (33, 400056), (34, 400045), (35, 400057),
    (36, 400059), (37, 400060), (38, 400002), (39, 400003), (40, 400021),
    (41, 400004), (42, 400005), (43, 400006), (44, 400007), (46, 400009),
    (47, 400010), (48, 400012), (49, 400013), (51, 400015), (52, 400016),
    (54, 400018), (55, 400019), (56, 400061), (57, 400063), (58, 400064),
    (59, 400065), (60, 400062), (61, 400066), (62, 400067), (63, 400068),
    (64, 400069), (65, 400070), (66, 400026), (67, 400027), (68, 400041),
    (69, 400028), (70, 400029), (71, 400034), (72, 400037), (73, 400039),
    (75, 400049)
]


# Pairs of integer ids for iOS categories.
# NOTE(review): presumably (raw store category id, unified category_id) -- the
# raw-data queries in this notebook filter on category_id=7011 while the
# unified ones filter on category_id=100012, matching the pair (7011, 100012).
# Confirm against the code that consumes this table.
IOS_CATEGORIES = [
    (36, 100000), (100, 100021), (360, 100030), (361, 100031), (362, 100032),
    (363, 100033), (6000, 100023), (6001, 100077), (6002, 100076), (6003, 100075),
    (6004, 100073), (6005, 100072), (6006, 100070), (6007, 100069), (6008, 100068),
    (6009, 100067), (6010, 100066), (6011, 100065), (6012, 100034), (6013, 100029),
    (6014, 100001), (6015, 100027), (6016, 100026), (6017, 100025), (6018, 100022),
    (6020, 100064), (6021, 100035), (6022, 100024), (6023, 100028), (6024, 100071),
    (6025, 100074), (7001, 100002), (7002, 100003), (7003, 100004), (7004, 100005),
    (7005, 100006), (7006, 100007), (7007, 100008), (7008, 100009), (7009, 100010),
    (7010, 100011), (7011, 100012), (7012, 100013), (7013, 100014), (7014, 100015),
    (7015, 100016), (7016, 100017), (7017, 100018), (7018, 100019), (7019, 100020),
    (13001, 100053), (13002, 100046), (13003, 100049), (13004, 100054), (13005, 100060),
    (13006, 100037), (13007, 100036), (13008, 100038), (13009, 100039), (13010, 100040),
    (13011, 100041), (13012, 100042), (13013, 100043), (13014, 100044), (13015, 100045),
    (13017, 100047), (13018, 100048), (13019, 100050), (13020, 100051), (13021, 100052),
    (13023, 100055), (13024, 100056), (13025, 100057), (13026, 100058), (13027, 100059),
    (13028, 100061), (13029, 100062), (13030, 100063)
]


# iOS store/country table: each tuple is (numeric store id, two-letter country code).
# Id 0 stands for the worldwide aggregate 'WW'.
# Used in both directions by country_code_to_id() and id_to_country_code().
IOS_STORE_COUNTRY_MAPPING = [
    (0, 'WW'), (143575, 'AL'), (143563, 'DZ'), (143564, 'AO'), (143538, 'AI'),
    (143540, 'AG'), (143505, 'AR'), (143524, 'AM'), (143460, 'AU'), (143445, 'AT'),
    (143568, 'AZ'), (143539, 'BS'), (143559, 'BH'), (143541, 'BB'), (143565, 'BY'),
    (143446, 'BE'), (143555, 'BZ'), (143576, 'BJ'), (143542, 'BM'), (143577, 'BT'),
    (143556, 'BO'), (143525, 'BW'), (143503, 'BR'), (143543, 'VG'), (143560, 'BN'),
    (143526, 'BG'), (143578, 'BF'), (143579, 'KH'), (143455, 'CA'), (143580, 'CV'),
    (143544, 'KY'), (143581, 'TD'), (143483, 'CL'), (143465, 'CN'), (143501, 'CO'),
    (143582, 'CG'), (143495, 'CR'), (143494, 'HR'), (143557, 'CY'), (143489, 'CZ'),
    (143458, 'DK'), (143545, 'DM'), (143508, 'DO'), (143509, 'EC'), (143516, 'EG'),
    (143506, 'SV'), (143518, 'EE'), (143583, 'FJ'), (143447, 'FI'), (143442, 'FR'),
    (143584, 'GM'), (143443, 'DE'), (143573, 'GH'), (143448, 'GR'), (143546, 'GD'),
    (143504, 'GT'), (143585, 'GW'), (143553, 'GY'), (143510, 'HN'), (143463, 'HK'),
    (143482, 'HU'), (143558, 'IS'), (143467, 'IN'), (143476, 'ID'), (143449, 'IE'),
    (143491, 'IL'), (143450, 'IT'), (143511, 'JM'), (143462, 'JP'), (143528, 'JO'),
    (143517, 'KZ'), (143529, 'KE'), (143493, 'KW'), (143586, 'KG'), (143587, 'LA'),
    (143519, 'LV'), (143497, 'LB'), (143588, 'LR'), (143520, 'LT'), (143451, 'LU'),
    (143515, 'MO'), (143530, 'MK'), (143531, 'MG'), (143589, 'MW'), (143473, 'MY'),
    (143532, 'ML'), (143521, 'MT'), (143590, 'MR'), (143533, 'MU'), (143468, 'MX'),
    (143591, 'FM'), (143523, 'MD'), (143592, 'MN'), (143547, 'MS'), (143593, 'MZ'),
    (143594, 'NA'), (143484, 'NP'), (143452, 'NL'), (143461, 'NZ'), (143512, 'NI'),
    (143534, 'NE'), (143561, 'NG'), (143457, 'NO'), (143562, 'OM'), (143477, 'PK'),
    (143595, 'PW'), (143485, 'PA'), (143597, 'PG'), (143513, 'PY'), (143507, 'PE'),
    (143474, 'PH'), (143478, 'PL'), (143453, 'PT'), (143498, 'QA'), (143487, 'RO'),
    (143469, 'RU'), (143598, 'ST'), (143479, 'SA'), (143535, 'SN'), (143599, 'SC'),
    (143600, 'SL'), (143464, 'SG'), (143496, 'SK'), (143499, 'SI'), (143601, 'SB'),
    (143472, 'ZA'), (143466, 'KR'), (143454, 'ES'), (143486, 'LK'), (143548, 'KN'),
    (143549, 'LC'), (143550, 'VC'), (143554, 'SR'), (143602, 'SZ'), (143456, 'SE'),
    (143459, 'CH'), (143470, 'TW'), (143603, 'TJ'), (143572, 'TZ'), (143475, 'TH'),
    (143551, 'TT'), (143536, 'TN'), (143480, 'TR'), (143604, 'TM'), (143552, 'TC'),
    (143537, 'UG'), (143492, 'UA'), (143481, 'AE'), (143444, 'GB'), (143441, 'US'),
    (143514, 'UY'), (143566, 'UZ'), (143502, 'VE'), (143471, 'VN'), (143571, 'YE'),
    (143605, 'ZW')]
    
# IOS_STORE_COUNTRY_MAPPING = [
#     (0, 'WW'), (143441, 'US'), (143465, 'CN'), (143460, 'AU'), (143444, 'GB')
#     ]

# Android store/country table: each tuple is (numeric store id, two-letter country code).
# Id 1000 stands for the worldwide aggregate 'WW'.
# Used in both directions by country_code_to_id() and id_to_country_code().
ANDROID_STORE_COUNTRY_MAPPING = [
    (17, 'AR'), (1, 'AU'), (35, 'AT'), (61, 'AZ'), (11, 'BE'), (18, 'BR'), (47, 'BG'),
    (2, 'CA'), (13, 'CL'), (3, 'CN'), (52, 'CO'), (64, 'CR'), (80, 'HR'), (36, 'CZ'),
    (38, 'DK'), (62, 'EC'), (33, 'EG'), (20, 'FI'), (6, 'FR'), (4, 'DE'), (46, 'GR'),
    (16, 'HK'), (37, 'HU'), (19, 'IN'), (21, 'ID'), (39, 'IE'), (40, 'IL'), (8, 'IT'),
    (9, 'JP'), (53, 'KZ'), (95, 'KE'), (50, 'KW'), (86, 'LV'), (65, 'LB'), (78, 'LT'),
    (24, 'MY'), (26, 'MX'), (23, 'NL'), (41, 'NZ'), (74, 'NG'), (42, 'NO'), (54, 'PK'),
    (56, 'PE'), (31, 'PH'), (28, 'PL'), (43, 'PT'), (84, 'PR'), (73, 'QA'), (44, 'RO'),
    (22, 'RU'), (51, 'SA'), (32, 'SG'), (45, 'SK'), (14, 'ZA'), (27, 'KR'), (5, 'ES'),
    (34, 'SE'), (12, 'CH'), (30, 'TW'), (29, 'TH'), (25, 'TR'), (48, 'UA'), (49, 'AE'),
    (7, 'GB'), (10, 'US'), (15, 'VN'), (1000, 'WW')
]
# ANDROID_STORE_COUNTRY_MAPPING = [
#   (12, 'CH'), (30, 'TW'), (25, 'TR'), (48, 'UA'),
#     (7, 'GB'), (10, 'US'), (1000, 'WW')
# ]



def device_code_to_feed(market_code, device_code, metric_name):
    """Return the numeric feed id for a (market, device, metric) combination.

    market_code: 'apple-store' or 'google-play'
    device_code: 'ios-phone', 'ios-tablet' or 'android-all'
    metric_name: 'est_free_app_download', 'est_paid_app_download' or 'est_revenue'

    Raises IndexError when the combination is not in the table.
    """
    # Rows are (market, device, metric, feed id).
    # NOTE(review): ios-tablet uses 101 for free and 100 for paid, the reverse of the
    # phone ordering (0 free / 1 paid) — presumably intentional, confirm against the feed definitions.
    feed_table = (
        ('apple-store', 'ios-phone', 'est_free_app_download', 0),
        ('apple-store', 'ios-phone', 'est_paid_app_download', 1),
        ('apple-store', 'ios-phone', 'est_revenue', 2),
        ('apple-store', 'ios-tablet', 'est_free_app_download', 101),
        ('apple-store', 'ios-tablet', 'est_paid_app_download', 100),
        ('apple-store', 'ios-tablet', 'est_revenue', 102),
        ('google-play', 'android-all', 'est_free_app_download', 0),
        ('google-play', 'android-all', 'est_paid_app_download', 1),
        ('google-play', 'android-all', 'est_revenue', 2),
    )
    wanted = (market_code, device_code, metric_name)
    feeds = [
        feed
        for (market, device, metric, feed) in feed_table
        if (market, device, metric) == wanted
    ]
    # Indexing the (possibly empty) match list keeps the IndexError on unknown combinations
    return feeds[0]



def country_code_to_id(market_code, code):
    """Return the numeric store id for a two-letter country code.

    'apple-store' looks the code up in IOS_STORE_COUNTRY_MAPPING; any other
    market_code uses ANDROID_STORE_COUNTRY_MAPPING.
    Raises KeyError when the code is not present for the selected store.
    """
    if market_code == 'apple-store':
        id_code_pairs = IOS_STORE_COUNTRY_MAPPING
    else:
        id_code_pairs = ANDROID_STORE_COUNTRY_MAPPING
    # Flip the (id, code) tuples so the country code becomes the key
    id_by_code = dict((country, store_id) for (store_id, country) in id_code_pairs)
    return id_by_code[code]


def category_to_legacy_category(market_code, legacy):
    """Return the legacy category id that corresponds to a category id.

    market_code: 'apple-store' selects IOS_CATEGORIES; anything else selects ANDROID_CATEGORIES.
    legacy:      despite its name, this is the lookup key, i.e. the category id
                 (second element of the table tuples); the legacy id is what gets returned.

    Raises KeyError when the category id is not in the selected table.
    """
    if market_code == 'apple-store':
        legacy_category_pairs = IOS_CATEGORIES
    else:
        legacy_category_pairs = ANDROID_CATEGORIES
    # Flip the (legacy id, category id) tuples so the category id becomes the key
    legacy_by_category = dict(
        (category_id, legacy_id) for (legacy_id, category_id) in legacy_category_pairs
    )
    return legacy_by_category[legacy]



def id_to_country_code(market_code, store_id):
    """Translate a numeric store id into its two-letter country code.

    market_code: 'apple-store' (the spelling used by the other lookup helpers in this
                 cell) or the older 'ios' spelling selects IOS_STORE_COUNTRY_MAPPING;
                 any other value selects ANDROID_STORE_COUNTRY_MAPPING.
    store_id:    numeric id, i.e. the first element of the mapping tuples.

    Returns the country code string.
    Raises KeyError when the id is not present for the selected store.
    """
    # Previously only 'ios' was recognised, so 'apple-store' silently fell through to the Android table
    if market_code in ('apple-store', 'ios'):
        id_code_pairs = IOS_STORE_COUNTRY_MAPPING
    else:
        id_code_pairs = ANDROID_STORE_COUNTRY_MAPPING
    # The tuples are already (id, code), so dict() yields the id -> code lookup directly
    return dict(id_code_pairs)[store_id]



# Ad-hoc check: "test" is not an iOS market code, so the Android table is used; id 15 maps to 'VN'
id_to_country_code("test",15)
# Sample (legacy category id, category id) pairs from the tables above:
# 13028, 100061
# 71, 400034
# category_to_legacy_category("apple-store",100000)


In [0]:

from pyspark.sql import functions as F
import datetime

start_week = "2019-07-21"
end_week = "2019-07-28"
# end = "2012-05-01"

# Turn the 'YYYY-MM-DD' bounds into datetime.date objects; end_week itself is excluded below
real_date1 = datetime.date(*map(int, start_week.split('-')))
real_date2 = datetime.date(*map(int, end_week.split('-')))
date_range = real_date2 - real_date1

dates = list()       # days collected since the last Saturday
sar_list = list()    # one {saturday: [days of that week, newest first]} dict per completed week
for offset in range(date_range.days):
    current_day = real_date1 + datetime.timedelta(offset)
    dates.append(current_day)
    if current_day.weekday() != 5:
        continue
    # Saturday (weekday 5) closes the week: hand over the pending days newest-first
    # and empty the buffer in place. Days after the last Saturday stay in `dates`.
    week_days = dates[::-1]
    del dates[:]
    sar_list.append({current_day: week_days})

# Flatten into (saturday string, [day strings]) tuples, the shape check_diff expects
test_path = [
    (saturday.strftime("%Y-%m-%d"), [d.strftime("%Y-%m-%d") for d in week_days])
    for week in sar_list
    for (saturday, week_days) in week.items()
]
        
        
        
def check_diff(weekly_data):
    """Register the temp views needed to compare one week of daily data with the weekly rank data.

    weekly_data: (week_date, daily_dates) where week_date is the 'YYYY-MM-DD' string of the
                 weekly partition and daily_dates is a list of 'YYYY-MM-DD' strings whose
                 daily partitions are read.

    Side effects: (re)registers the temp views daily_rank, daily_est, weekly_rank, daily_join,
    sum_daily, sum_rank_daily and weekly_rank_1 on the global `spark` session.
    The actual comparison queries are commented out, so nothing is shown and the function
    implicitly returns None.
    """
    # Daily category-rank and daily estimates for the listed days ({a,b,c} is a path glob),
    # plus the weekly category-rank partition for the week date.
    # basePath is set presumably so partition columns such as date are exposed (the join below uses date) — TODO confirm.
    df_1 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=daily/date={%s}/" % ",".join(weekly_data[1]))
    df_2 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={%s}/" % ",".join(weekly_data[1]))
    df_3 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=weekly/date=%s/" % weekly_data[0])
    # df_4 = spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=weekly/date=%s/" % weekly_data[0])

    df_1.createOrReplaceTempView("daily_rank")
    df_2.createOrReplaceTempView("daily_est")
    df_3.createOrReplaceTempView("weekly_rank")
    # df_4.createOrReplaceTempView("weekly_est")
    
    # print "compare app id: " , weekly_data
    
    # daily_join: estimates joined to their category via a full outer join on country/device/app/date.
    # The key columns are taken from daily_est, so rows that exist only in daily_rank carry NULL keys.
    spark.sql('''select daily_est.free_app_download, daily_est.paid_app_download, daily_est.revenue, daily_est.app_id, daily_est.device_code, daily_est.country_code, daily_rank.category_id from daily_rank 
                full outer join daily_est 
                on daily_rank.country_code= daily_est.country_code
                and daily_rank.device_code=daily_est.device_code
                and daily_rank.app_id = daily_est.app_id 
                and daily_rank.date = daily_est.date''').createOrReplaceTempView("daily_join")

    # spark.sql('''select weekly_est.free_app_download, weekly_rank.free_app_download as rank, weekly_est.paid_app_download, weekly_est.revenue, weekly_est.app_id, weekly_est.device_code, weekly_est.country_code, weekly_rank.category_id from weekly_rank 
    #             join weekly_est 
    #             on weekly_rank.country_code= weekly_est.country_code
    #             and weekly_rank.device_code=weekly_est.device_code
    #             and weekly_rank.app_id = weekly_est.app_id 
    #             and weekly_rank.date = weekly_est.date ''').createOrReplaceTempView("weekly_join")



    # sum_daily: free downloads summed over the week per app / country / device / category
    spark.sql("select app_id, country_code, sum(free_app_download) as free_app_download, device_code, category_id from daily_join group by country_code, device_code, category_id, app_id ").createOrReplaceTempView("sum_daily")
        
    # # spark.sql("select app_id, country_code, free_app_download, device_code, category_id from sum_daily where free_app_download is not null").createOrReplaceTempView("sum_daily_free_download")
    # spark.sql("select * from weekly_join").show()

    # spark.sql('''select app_id, country_code, device_code, category_id, free_app_download, ROW_NUMBER() OVER (PARTITION BY device_code, country_code, category_id ORDER BY free_app_download DESC) free_app_download_rank from sum_daily''').createOrReplaceTempView("order_sum_daily_free_download_rank")


    

    # spark.sql('''select count(*) from 
    #                 (select app_id, country_code,free_app_download,device_code,category_id from order_sum_daily_free_download_rank except all select app_id, country_code,free_app_download,device_code,category_id from weekly) as test''').withColumn("date", F.lit(weekly_data[0]) ).withColumn("type",F.lit("daily_weekly")).show()

    # sum_rank_daily: rank recomputed from the summed daily downloads,
    # hard-coded to the US / ios-phone / category 100013 slice
    spark.sql(''' select *, 
                    ROW_NUMBER() OVER (PARTITION BY device_code, country_code, category_id ORDER BY free_app_download DESC) rank 
                    from sum_daily 
                    where country_code='US' and category_id=100013 and device_code='ios-phone' 
                        order by free_app_download desc''').createOrReplaceTempView("sum_rank_daily")
    # weekly_rank_1: the same slice taken from the weekly rank data, non-null free_app_download only
    spark.sql('''select app_id, country_code, free_app_download, device_code, category_id 
                    from weekly_rank 
                    where country_code='US' 
                        and device_code='ios-phone' 
                        and category_id=100013  
                        and free_app_download is not null 
                        order by free_app_download desc''').createOrReplaceTempView("weekly_rank_1")
                        
    # spark.sql('''select count(*) from ( select * from sum_rank_daily where free_app_download is not null except select * from weekly_rank ) as t1 ''').show()
    # spark.sql('''select count(*) from ( select * from weekly_rank except select * from sum_rank_daily where free_app_download is not null ) as t2 ''').show()
    # spark.sql('''select count(*) from sum_rank_daily where free_app_download is not null ''').show()


# Run the weekly check for every (week date, [daily dates]) entry in test_path.
# check_diff does all of its Spark work from the driver and returns None, so a plain loop
# is sufficient; wrapping the results in sc.parallelize only built an unused RDD of Nones.
for weekly_data in test_path:
    check_diff(weekly_data)

In [0]:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType
def get_date_list(start_date, end_date, freq="D"):
    """Return the dates from start_date to end_date as 'YYYY-MM-DD' strings.

    start_date, end_date: range bounds, passed unchanged to pandas.date_range.
    freq:   D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency

    Returns a list of strings, one per date generated at the given frequency.
    """
    # The docstring has to be the first statement to be picked up as __doc__,
    # so the function-local pandas import now follows it.
    import pandas as pd
    return [x.strftime('%Y-%m-%d') for x in pd.date_range(start=start_date, end=end_date, freq=freq)]
# Dates to sum: every day from 2010-07-04 to 2010-07-31, followed by each month end
# from 2010-08-31 to 2018-12-31. The last entry also selects the cumulative partition in main_test.
dates = get_date_list('2010-07-04', '2010-07-31', freq='D')    # >> ['2010-07-04', '2010-07-05', '2010-07-06', ...]
dates = dates + get_date_list('2010-08-31', '2018-12-31', freq='M')
print dates
# Sample apps; the labels below line up column-wise with the ids in app_list
# mapping:  ios-phone,    ios-all,     ios-tablet,  free_app_download=2, facebook
app_list = ['377194688', '364709193', '379174209',  '339739007',         '284882215']
device_code = ['ios-phone','ios-tablet','ios-all']    # exclude 'android-all'
countries = ['WW', 'US']
def main_test(dates, app_list, device_code, countries):
    """Compare free downloads summed over `dates` with the cumulative dataset.

    dates:       list of 'YYYY-MM-DD' strings; one store.app-est.v1 partition is read per
                 entry, and the last entry selects the store.app-est-cum.v1 partition.
    app_list:    app ids, rendered unquoted into the SQL filter.
    device_code: device codes, rendered as quoted strings into the SQL filter.
    countries:   country codes, rendered as quoted strings into the SQL filter.

    Returns a DataFrame with the (app_id, device_code, country_code, free_app_download)
    rows of the summed data that have no identical row in the cumulative data;
    an empty result means both sources agree for the filtered apps.
    Raises IndexError when `dates` is empty (dates[-1]).

    NOTE(review): every date, including the month-end ones, is read from
    granularity=daily — confirm that is the intended source for those dates.
    """
    where_clause = "app_id in ({}) and device_code in ('{}') and country_code in ('{}')".format(
        ",".join(map(str, app_list)), "','".join(device_code), "','".join(countries))
    # >> "app_id in (377194688,364709193,379174209,339739007,284882215) and device_code in ('ios-phone','ios-tablet','ios-all') and country_code in ('WW','US')"
    print(where_clause)

    # Empty seed frame: fixes the column order/types that every per-date slice is unioned onto
    schema = StructType([
        StructField("app_id", StringType(), True),
        StructField("device_code", StringType(), True),
        StructField("country_code", StringType(), True),
        StructField("free_app_download", IntegerType(), True),
        # StructField("paid_app_download", IntegerType(), True)
        ])
    uni_df = spark.createDataFrame([], schema=schema)

    # Collect the filtered rows of every requested date first ...
    for date in dates:
        unified_data = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={}".format(date)).where(where_clause).select('app_id', 'device_code', 'country_code', 'free_app_download')
        uni_df = uni_df.union(unified_data)
    # ... and aggregate once. Summing after all unions gives the same totals as re-grouping
    # inside the loop, without stacking one aggregation per date into the query plan.
    uni_df = uni_df.groupby('app_id', 'device_code', 'country_code').agg({'free_app_download': 'sum'}).withColumnRenamed('sum(free_app_download)', 'free_app_download')

    # fetch data from cumulative
    cum_df = spark.read.format('delta').load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-cum.v1/fact/date={}/".format(dates[-1])).where(where_clause).select('app_id', 'device_code', 'country_code', 'free_app_download')
    res_df = uni_df.subtract(cum_df)
    return res_df
# Rows that differ between the summed per-date data and the cumulative data
a = main_test(dates, app_list, device_code, countries)
a.show()
# count() returns only the row total; len(a.collect()) shipped every row to the driver just to count them
print(a.count())

