In [0]:

import datetime
from pyspark.sql import types as T
from pyspark.sql import functions as F

from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.pipeline import type_
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from applications.auto_pipeline.transform import _view
from aadatapipelinecore.core.utils.spark import eject_all_caches
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import aaplproxy

"""
get date:  [ [month, [days]], [month, [days]], [month, [days]], ....... ]
"""
def get_date_list(start_date, end_date, freq="D"):
    """Return the dates between start_date and end_date as 'YYYY-MM-DD' strings.

    The dates are produced by ``pandas.date_range(start=start_date,
    end=end_date, freq=freq)`` and each one is formatted with '%Y-%m-%d'.

    freq:   D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency

    Returns:
        list of str, one entry per generated date.
    """
    # Local import keeps the function usable when only this cell has been run.
    import pandas as pd

    dates = pd.date_range(start=start_date, end=end_date, freq=freq)
    return [day.strftime('%Y-%m-%d') for day in dates]

# start = "2010-07-31"  # prod
start = "2018-09-30"    # test
end = start

monthly = get_date_list(start, end, freq='M')
print monthly


date_list = []
for m in monthly:
    d = get_date_list(m[:8]+'01', m, freq='D')  # start = the first day of each month; end = each month
    month_and_day = [m, d]
    date_list.append(month_and_day)

print date_list



class DryRunSqlExecutor(SqlExecutor):
    """SqlExecutor subclass whose ``_verify_tasks`` step does nothing."""

    def _verify_tasks(self):
        """Skip task verification (intentional no-op)."""
        return None


def run(spark, raw_data, sql_text, dry_run=True):
    """Parse ``sql_text`` into tasks and run them through a SQL executor.

    Args:
        spark: Spark session handed to ``_view``, ``SqlParser`` and the executor.
        raw_data: dict holding at least the keys ``"namespace"``, ``"source"``
            and ``"options"``. It is left unmodified; a shallow copy is used to
            build the execution context.
        sql_text: SQL script passed to ``_view`` and ``SqlParser``.
        dry_run: when True (default) the tasks run with ``DryRunSqlExecutor``,
            otherwise with ``SqlExecutor``.

    Raises:
        KeyError: if ``raw_data`` lacks one of the required keys.
    """
    # Work on a shallow copy so the caller's message keeps its "source" and
    # "options" keys and can be passed to run() again (e.g. on a cell re-run).
    context = dict(raw_data)
    urn = Urn(namespace=context["namespace"])
    source_data_list = context.pop("source")
    # Merge the option entries into the top level of the context.
    context.update(context.pop("options"))
    _view(spark, sql_text, None, source_data_list)
    tasks = SqlParser(spark, sql_text, context).parse()
    sql_executor = DryRunSqlExecutor if dry_run else SqlExecutor
    sql_executor(urn, spark, tasks, type_.EventType.TRANSFORM, context).run()

# CSV schema
from pyspark.sql import types as T
from pyspark.sql import functions as F

# Schema applied when reading the CSV inputs; every column is nullable.
csv_schema = (
    T.StructType()
    .add("store_id", T.IntegerType(), True)
    .add("date", T.DateType(), True)
    .add("platform_id", T.IntegerType(), True)
    .add("vertical", T.IntegerType(), True)
    .add("feed", T.IntegerType(), True)
    .add("id", T.LongType(), True)
    .add("est", T.IntegerType(), True)
    .add("category_id", T.IntegerType(), True)
    .add("rank", T.IntegerType(), True)
)

def test_monthkly_data(test_data):
    """Load Q4-2019 daily estimates and run the aggregation SQL via ``run``.

    Steps performed by the live code:
      1. read the parquet daily estimates for 2019-10-01..2019-12-31 and
         register them as the temp view ``daily_data``;
      2. read the rows with granularity 'quarterly', date '2019-12-31' and
         data_stage 'final' and register them as ``unified_quarterly``;
      3. pass the SQL script below, together with the two country-mapping
         CSV sources, to ``run`` (which defaults to ``dry_run=True``);
      4. call ``eject_all_caches(spark)``.

    Args:
        test_data: sequence whose first element is read into
            ``month_indicator``. NOTE(review): the per-month branching that
            consumed it is disabled (it lives inside a string literal), so the
            date range actually loaded is hard-coded and does not depend on
            this argument.

    Returns:
        None.
    """
    month_indicator = test_data[0]
    # The triple-quoted block below is a bare string expression, i.e. disabled
    # code: none of the csv / mixed csv+parquet branches in it are executed.
    '''
    ### 1. only csv, but date range is '2010-07-04' to '2010-07-31' ###
    if month_indicator == '2010-07-31':
        temp_date_range = get_date_list('2010-07-04', '2010-07-31', freq='D')
        df_1 = spark.read.option("basePath","s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/").schema(csv_schema).csv( "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/{%s}/ios/sbe_est_app/*/"%",".join(temp_date_range), sep="\t").withColumn("platform", F.lit("ios")).select('id', 'store_id', 'category_id', 'platform_id', 'vertical', 'rank', 'feed', 'est', 'date', 'platform').cache()
        

    ### 2. only csv
    elif month_indicator > '2010-08-01' and month_indicator < '2019-07-01':
        df_ios = spark.read.option("basePath",
                                   "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/").schema(
            csv_schema).csv(
            "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/{%s}/ios/sbe_est_app/*/" % (','.join(test_data[1])),
            sep="\t").withColumn("platform", F.lit("ios")).select('id', 'store_id', 'category_id',
                                                                  'platform_id', 'vertical', 'rank', 'feed',
                                                                  'est', 'date', 'platform').cache()
        df_android = spark.read.option("basePath",
                                       "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/").schema(
            csv_schema).csv(
            "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/{%s}/android/sbe_est_app/*/" % (
                ','.join(test_data[1])), sep="\t").withColumn("platform", F.lit("android")).select('id', 'store_id',
                                                                                      'category_id',
                                                                                      'platform_id',
                                                                                      'vertical', 'rank',
                                                                                      'feed', 'est', 'date',
                                                                                      'platform').cache()
        df_1 = df_ios.union(df_android)
        

    ### 3. half is csv, half is parquet ###
    elif month_indicator == '2019-07-31':
        # First half of 2019-07
        temp_date_range = get_date_list('2019-07-01', '2019-07-14')
        first_half_month_df = spark.read.option("basePath", "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/").schema(csv_schema).csv("s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/{%s}/ios/sbe_est_app/*/"%",".join(temp_date_range), sep="\t").withColumn("platform", F.lit("ios")).select('id', 'store_id', 'category_id', 'platform_id', 'vertical', 'rank', 'feed', 'est', 'date', 'platform').cache()
        # Second half of 2019-07
        temp_date_range = get_date_list('2019-07-15', '2019-07-31')
        second_half_month = spark.read.option("basePath","s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/").parquet("s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/date={%s}/platform=*/*/" %  ",".join(temp_date_range)).select('id', 'store_id', 'category_id', 'platform_id', 'vertical', 'rank', 'feed', 'est', 'date', 'platform').cache()
        df_1 = first_half_month_df.union(second_half_month)
        

    ### 4. only parquet ###
    else:  # month_indicator >= '2019-08-31'
    '''
    # Hard-coded window: every day from 2019-10-01 through 2019-12-31 is
    # expanded into the date={...} glob of the parquet path.
    temp_date_range = get_date_list('2019-10-01', '2019-12-31')
    df_1 = spark.read.option("basePath","s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/").parquet("s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/date={%s}/platform=*/*/" %  ",".join(temp_date_range)) .cache()

    # The SQL script below reads the daily rows through this view name.
    df_1.createOrReplaceTempView("daily_data")
    
    
    
    # Stored quarterly result that the module-level diff queries compare against.
    quarterly_df_ho = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-pre-aggr.v3/fact/").where("granularity='quarterly' and date='{}' and data_stage='final'".format('2019-12-31')).cache()
    quarterly_df_ho.createOrReplaceTempView("unified_quarterly")

    # Multi-statement SQL script: each "WITH name AS (...);" section is given
    # to SqlParser by run(); later sections reference earlier names.
    sql_text = """
    
    WITH filter_top_N_raw_data AS(
    SELECT
     distinct
      id,
      Sum(est) AS est,
      store_id,
      platform_id,
      feed,
      vertical,
      platform
    FROM
      (
        SELECT
          DISTINCT d1.id,
          d1.est,
          d1.store_id,
          d1.date,
          d1.feed,
          d1.vertical,
          d1.platform_id,
          d1.platform
        FROM
          daily_data AS d1
          JOIN daily_data AS d2 
          ON d1.id = d2.id
          AND d1.store_id = d2.store_id
          AND d1.feed = d2.feed
          AND d1.vertical = d2.vertical
          AND d1.platform_id = d2.platform_id
        WHERE (d1.rank <= 4000 and d2.rank<=4000 and d1.store_id == 0 and d1.platform = 'ios' ) 
            OR (d1.rank <= 1000 and d2.rank<=1000 and  d1.store_id != 0 and d1.platform = 'ios' )
            OR  (d1.rank <= 4000 and d2.rank<=4000 and d1.store_id == 1000 and d1.platform = 'android' ) 
            OR (d1.rank <= 1000 and d2.rank<=1000 and  d1.store_id != 1000 and d1.platform = 'android' )
      ) AS t
    WHERE
      feed IN (
        0,
        1,
        2,
        101,
        100,
        102
      )
    GROUP BY
      id,
      store_id,
      platform_id,
      vertical,
      feed,
      platform);
      
     WITH replace_metric AS (
     SELECT * ,
         case 
        when feed='0' and platform='ios' then 'free_app_download'
        when feed='1' and platform='ios' then 'paid_app_download'
        when feed='2' and platform='ios' then 'revenue' 
        when feed='101' and platform='ios' then 'free_app_download' 
        when feed='100' and platform='ios' then 'paid_app_download' 
        when feed='102' and platform='ios' then 'revenue' 
        when feed='0' and platform='android' then 'free_app_download' 
        when feed='1' and platform='android' then 'paid_app_download' 
        when feed='2' and platform='android' then 'revenue' 
        end as metric from filter_top_N_raw_data);
      
      
         WITH replace_metric_device_code AS (
        SELECT * ,
         case 
        when feed='0' and platform='ios' then 'ios-phone'
        when feed='1' and platform='ios' then 'ios-phone'
        when feed='2' and platform='ios' then 'ios-phone' 
        when feed='101' and platform='ios' then 'ios-tablet' 
        when feed='100' and platform='ios' then 'ios-tablet' 
        when feed='102' and platform='ios' then 'ios-tablet' 
        when feed='0' and platform='android' then 'android-all' 
        when feed='1' and platform='android' then 'android-all' 
        when feed='2' and platform='android' then 'android-all' 
        end as device_code from replace_metric);


    WITH group_by_metric_1 AS (
        SELECT max(est) as est, id, metric,device_code, store_id, platform from replace_metric_device_code where store_id not in (3,4,5,6) and device_code in ('ios-phone' ,'ios-tablet' ) and feed not in (1000, 1001, 1002) group by id, store_id, metric,device_code, platform
        );
        
    WITH group_by_metric_2 AS (
        SELECT max(est) as est, id, metric,device_code, store_id, platform from replace_metric_device_code where store_id not in ( 1003, 1005, 1006,1007) and device_code='android-all' and feed not in (1000, 1001, 1002) group by id, store_id, metric,device_code, platform
    );
    
    WITH group_by_metric AS(
        SELECT * FROM group_by_metric_1
        UNION ALL
        SELECT * FROM group_by_metric_2
        );

      -- pivot metric column
    WITH pivot_metric_raw AS (

    SELECT 
        distinct id as app_id, store_id, platform, device_code, free_app_download,revenue, paid_app_download
    FROM
          group_by_metric
     PIVOT (
        max(est) 
    	FOR metric IN ('free_app_download','revenue', 'paid_app_download')
      )
    );
    
    
    -- union all platform with country_code mapping

    WITH country_code_mapping AS (
    select *, 'android' as market_code from android_country_mapping 
    UNION ALL select *, 'ios' market_code from ios_country_mapping
    UNION ALL select 143502, 'VE', 'VESA', 'ios'
    UNION ALL select 0, 'WW', 'worldwide', 'ios'
    UNION ALL select 36, 'CZ', 'CZ', 'android'
    UNION ALL select 5, 'ES', 'ES', 'android'

    );



    -- map raw with country_code

    WITH country_category_mapping_raw AS (
    select app_id, country_code, device_code, free_app_download, paid_app_download, revenue 
     from country_code_mapping 
     inner join 
         pivot_metric_raw 
     on 
         country_code_mapping.store_id=pivot_metric_raw.store_id 
     and 
         country_code_mapping.market_code=pivot_metric_raw.platform
    where country_name!='Global'
    );


      """
    
    # store_unified , rank_unified
    # NOTE(review): `namespace` is assigned but not read again in this
    # function; the message below repeats the literal instead.
    namespace = "aa.store.market-size.v1"
    # Message consumed by run(): "source" lists the two tab-separated, gzip
    # country-mapping CSV inputs named ios_country_mapping and
    # android_country_mapping, which the SQL script refers to by those names.
    ingest_msg = {
        "namespace": "aa.store.market-size.v1",
        "job_type": "routine",
        "options": {},
        "source": [
            {
            "data_encoding": "csv",
            "compression": "gzip",
            "name":"ios_country_mapping",
            "data_schema": [
                            {"name":"store_id","type":"int","nullable": False},
                            {"name":"country_code","type":"string","nullable": False},
                            {"name":"country_name","type":"string","nullable": False}
                            ],
             "csv_options": {
          'header': True,
          'sep': '\t',
          'quote': '',
          'encoding': 'utf-8',
          'escape': ''
          },

            "path": ["s3://b2c-prod-dca-store-estimates/store_back/dimension/IOS_COUNTRY_MAPPING"],
        },   
        {
            "data_encoding": "csv",
            "compression": "gzip",
            "name":"android_country_mapping",
            "data_schema": [
                            {"name":"store_id","type":"int","nullable": False},
                            {"name":"country_code","type":"string","nullable": False},
                            {"name":"country_name","type":"string","nullable": False}
                            ],
             "csv_options": {
          'header': True,
          'sep': '\t',
          'quote': '',
          'encoding': 'utf-8',
          'escape': ''
          },

            "path": ["s3://b2c-prod-dca-store-estimates/store_back/dimension/ANDROID_COUNTRY_MAPPING"],
        }
        ]
    }
    
    # dry_run is left at its default (True), so DryRunSqlExecutor is used.
    run(spark, ingest_msg, sql_text)
    eject_all_caches(spark)


# Run the check once per [month_end, [days]] entry. test_monthkly_data uses the
# driver-side `spark` session and returns None, so a plain loop is used instead
# of wrapping the (eagerly evaluated) results in an RDD that nothing reads.
for month_entry in date_list:
    test_monthkly_data(month_entry)

# Rows produced by the SQL script that are missing from the stored quarterly
# data, and the reverse direction.
diff_df1 = spark.sql("select * from country_category_mapping_raw except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from unified_quarterly ")
diff_df2 = spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from unified_quarterly  except all select * from country_category_mapping_raw")

diff_df1.show()
diff_df2.show()


In [0]:

from pyspark.sql.functions import sum
from pyspark.sql.functions import desc


def get_date_list(start_date, end_date, freq="D"):
    """Return the dates between start_date and end_date as 'YYYY-MM-DD' strings.

    The dates are produced by ``pandas.date_range(start=start_date,
    end=end_date, freq=freq)`` and each one is formatted with '%Y-%m-%d'.

    freq:   D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency

    Returns:
        list of str, one entry per generated date.
    """
    # Local import keeps the function usable when only this cell has been run.
    import pandas as pd

    dates = pd.date_range(start=start_date, end=end_date, freq=freq)
    return [day.strftime('%Y-%m-%d') for day in dates]
    

start = '2019-10-01'
end = '2019-12-31'

daily_df = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-load.v3/fact/").where("granularity='daily' and date between '{}' and '{}'".format(start, end))
daily_df = daily_df.na.fill(0)

agg_df = daily_df.groupBy('app_id', 'country_code', 'device_code').agg(sum('est_free_app_download').alias('free_app_download'), sum('est_paid_app_download').alias('paid_app_download'), sum('est_revenue').alias('revenue'))
# print agg_df.count()
# agg_df.orderBy(desc('free_app_download')).show(1000)
agg_df.createOrReplaceTempView("agg_df")
top_df = spark.sql("select * from agg_df where country_code='WW' order by free_app_download desc, app_id desc")
top_df.show(1000)


quarterly_df_ho = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-pre-aggr.v3/fact/").where("granularity='quarterly' and date='{}' and data_stage='final'".format(end))
# print quarterly_df_ho.count()
# quarterly_df_ho.show()
quarterly_df_ho = quarterly_df_ho.na.fill(0)

quarterly_df_ho.createOrReplaceTempView("quarterly_df")

diff_df1 = spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from agg_df except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from quarterly_df")
diff_df2 = spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from quarterly_df except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from agg_df")
print diff_df1.count(), diff_df2.count()
# diff_df1.show()
# diff_df2.show()



In [0]:

from pyspark.sql.functions import sum
from pyspark.sql.functions import desc


def get_date_list(start_date, end_date, freq="D"):
    """Return the dates between start_date and end_date as 'YYYY-MM-DD' strings.

    The dates are produced by ``pandas.date_range(start=start_date,
    end=end_date, freq=freq)`` and each one is formatted with '%Y-%m-%d'.

    freq:   D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency

    Returns:
        list of str, one entry per generated date.
    """
    # Local import keeps the function usable when only this cell has been run.
    import pandas as pd

    dates = pd.date_range(start=start_date, end=end_date, freq=freq)
    return [day.strftime('%Y-%m-%d') for day in dates]
    

start = '2013-07-01'
end = '2013-09-30'

daily_df = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-load.v3/fact/").where("granularity='daily' and date between '{}' and '{}'".format(start, end))
daily_df = daily_df.na.fill(0)

agg_df = daily_df.groupBy('app_id', 'country_code', 'device_code').agg(sum('est_free_app_download').alias('free_app_download'), sum('est_paid_app_download').alias('paid_app_download'), sum('est_revenue').alias('revenue'))
# print agg_df.count()
# agg_df.orderBy(desc('free_app_download')).show(1000)
agg_df.createOrReplaceTempView("agg_df")
top_df = spark.sql("select * from agg_df where country_code='WW' order by free_app_download desc, app_id desc")
top_df.show(1000)


quarterly_df_ho = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-pre-aggr.v3/fact/").where("granularity='quarterly' and date='{}' and data_stage='final'".format(end))
# print quarterly_df_ho.count()
# quarterly_df_ho.show()
quarterly_df_ho = quarterly_df_ho.na.fill(0)

quarterly_df_ho.createOrReplaceTempView("quarterly_df")

diff_df1 = spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from agg_df except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from quarterly_df")
diff_df2 = spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from quarterly_df except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from agg_df")
print diff_df1.count(), diff_df2.count()
# diff_df1.show()
# diff_df2.show()


In [0]:

from pyspark.sql.functions import sum
from pyspark.sql.functions import desc


def get_date_list(start_date, end_date, freq="D"):
    """Return the dates between start_date and end_date as 'YYYY-MM-DD' strings.

    The dates are produced by ``pandas.date_range(start=start_date,
    end=end_date, freq=freq)`` and each one is formatted with '%Y-%m-%d'.

    freq:   D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency

    Returns:
        list of str, one entry per generated date.
    """
    # Local import keeps the function usable when only this cell has been run.
    import pandas as pd

    dates = pd.date_range(start=start_date, end=end_date, freq=freq)
    return [day.strftime('%Y-%m-%d') for day in dates]
    

start = '2019-07-01'
end = '2019-09-30'

daily_df = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-load.v3/fact/").where("granularity='daily' and date between '{}' and '{}'".format(start, end))
daily_df = daily_df.na.fill(0)

agg_df = daily_df.groupBy('app_id', 'country_code', 'device_code').agg(sum('est_free_app_download').alias('free_app_download'), sum('est_paid_app_download').alias('paid_app_download'), sum('est_revenue').alias('revenue'))
# print agg_df.count()
# agg_df.orderBy(desc('free_app_download')).show(1000)
agg_df.createOrReplaceTempView("agg_df")
top_df = spark.sql("select * from agg_df where country_code='WW' order by free_app_download desc, app_id desc")
top_df.show(1000)


quarterly_df_ho = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-pre-aggr.v3/fact/").where("granularity='quarterly' and date='{}' and data_stage='final'".format(end))
# print quarterly_df_ho.count()
# quarterly_df_ho.show()
quarterly_df_ho = quarterly_df_ho.na.fill(0)

quarterly_df_ho.createOrReplaceTempView("quarterly_df")

diff_df1 = spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from agg_df except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from quarterly_df")
diff_df2 = spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from quarterly_df except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from agg_df")
print diff_df1.count(), diff_df2.count()
# diff_df1.show()
# diff_df2.show()

In [0]:

from pyspark.sql.functions import sum
from pyspark.sql.functions import desc


def get_date_list(start_date, end_date, freq="D"):
    """Return the dates between start_date and end_date as 'YYYY-MM-DD' strings.

    The dates are produced by ``pandas.date_range(start=start_date,
    end=end_date, freq=freq)`` and each one is formatted with '%Y-%m-%d'.

    freq:   D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency

    Returns:
        list of str, one entry per generated date.
    """
    # Local import keeps the function usable when only this cell has been run.
    import pandas as pd

    dates = pd.date_range(start=start_date, end=end_date, freq=freq)
    return [day.strftime('%Y-%m-%d') for day in dates]
    

start = '2018-04-01'
end = '2018-06-30'

daily_df = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-load.v3/fact/").where("granularity='daily' and date between '{}' and '{}'".format(start, end))
daily_df = daily_df.na.fill(0)

agg_df = daily_df.groupBy('app_id', 'country_code', 'device_code').agg(sum('est_free_app_download').alias('free_app_download'), sum('est_paid_app_download').alias('paid_app_download'), sum('est_revenue').alias('revenue'))
# print agg_df.count()
# agg_df.orderBy(desc('free_app_download')).show(1000)
agg_df.createOrReplaceTempView("agg_df")
top_df = spark.sql("select * from agg_df where country_code='WW' order by free_app_download desc, app_id desc")
top_df.show(1000)


quarterly_df_ho = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-pre-aggr.v3/fact/").where("granularity='quarterly' and date='{}' and data_stage='final'".format(end))
# print quarterly_df_ho.count()
# quarterly_df_ho.show()
quarterly_df_ho = quarterly_df_ho.na.fill(0)

quarterly_df_ho.createOrReplaceTempView("quarterly_df")

diff_df1 = spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from agg_df except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from quarterly_df")
diff_df2 = spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from quarterly_df except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from agg_df")
print diff_df1.count(), diff_df2.count()
# diff_df1.show()
# diff_df2.show()

In [0]:

from pyspark.sql.functions import sum
from pyspark.sql.functions import desc


def get_date_list(start_date, end_date, freq="D"):
    """Return the dates between start_date and end_date as 'YYYY-MM-DD' strings.

    The dates are produced by ``pandas.date_range(start=start_date,
    end=end_date, freq=freq)`` and each one is formatted with '%Y-%m-%d'.

    freq:   D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency

    Returns:
        list of str, one entry per generated date.
    """
    # Local import keeps the function usable when only this cell has been run.
    import pandas as pd

    dates = pd.date_range(start=start_date, end=end_date, freq=freq)
    return [day.strftime('%Y-%m-%d') for day in dates]
    

start = '2019-01-01'
end = '2019-12-31'

daily_df = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-load.v3/fact/").where("granularity='daily' and date between '{}' and '{}'".format(start, end))
daily_df = daily_df.na.fill(0)

agg_df = daily_df.groupBy('app_id', 'country_code', 'device_code').agg(sum('est_free_app_download').alias('free_app_download'), sum('est_paid_app_download').alias('paid_app_download'), sum('est_revenue').alias('revenue'))
# print agg_df.count()
# agg_df.orderBy(desc('free_app_download')).show(1000)
agg_df.createOrReplaceTempView("agg_df")
top_df = spark.sql("select * from agg_df where country_code='WW' order by free_app_download desc, app_id desc")
top_df.show(1000)


quarterly_df_ho = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-pre-aggr.v3/fact/").where("granularity='yearly' and date='{}' and data_stage='final'".format(end))
# print quarterly_df_ho.count()
# quarterly_df_ho.show()
quarterly_df_ho = quarterly_df_ho.na.fill(0)

quarterly_df_ho.createOrReplaceTempView("quarterly_df")

diff_df1 = spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from agg_df except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from quarterly_df")
diff_df2 = spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from quarterly_df except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from agg_df")
print diff_df1.count(), diff_df2.count()
# diff_df1.show()
# diff_df2.show()

In [0]:

from pyspark.sql.functions import sum
from pyspark.sql.functions import desc


def get_date_list(start_date, end_date, freq="D"):
    """Return the dates between start_date and end_date as 'YYYY-MM-DD' strings.

    The dates are produced by ``pandas.date_range(start=start_date,
    end=end_date, freq=freq)`` and each one is formatted with '%Y-%m-%d'.

    freq:   D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency

    Returns:
        list of str, one entry per generated date.
    """
    # Local import keeps the function usable when only this cell has been run.
    import pandas as pd

    dates = pd.date_range(start=start_date, end=end_date, freq=freq)
    return [day.strftime('%Y-%m-%d') for day in dates]
    

start = '2018-01-01'
end = '2018-12-31'

daily_df = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-load.v3/fact/").where("granularity='daily' and date between '{}' and '{}'".format(start, end))
daily_df = daily_df.na.fill(0)

agg_df = daily_df.groupBy('app_id', 'country_code', 'device_code').agg(sum('est_free_app_download').alias('free_app_download'), sum('est_paid_app_download').alias('paid_app_download'), sum('est_revenue').alias('revenue'))
# print agg_df.count()
# agg_df.orderBy(desc('free_app_download')).show(1000)
agg_df.createOrReplaceTempView("agg_df")
top_df = spark.sql("select * from agg_df where country_code='WW' order by free_app_download desc, app_id desc")
top_df.show(1000)


quarterly_df_ho = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-pre-aggr.v3/fact/").where("granularity='yearly' and date='{}' and data_stage='final'".format(end))
# print quarterly_df_ho.count()
# quarterly_df_ho.show()
quarterly_df_ho = quarterly_df_ho.na.fill(0)

quarterly_df_ho.createOrReplaceTempView("quarterly_df")

diff_df1 = spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from agg_df except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from quarterly_df")
diff_df2 = spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from quarterly_df except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from agg_df")
print diff_df1.count(), diff_df2.count()
# diff_df1.show()
# diff_df2.show()

In [0]:

from pyspark.sql.functions import sum
from pyspark.sql.functions import desc


def get_date_list(start_date, end_date, freq="D"):
    """Return the dates between start_date and end_date as 'YYYY-MM-DD' strings.

    The dates are produced by ``pandas.date_range(start=start_date,
    end=end_date, freq=freq)`` and each one is formatted with '%Y-%m-%d'.

    freq:   D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency

    Returns:
        list of str, one entry per generated date.
    """
    # Local import keeps the function usable when only this cell has been run.
    import pandas as pd

    dates = pd.date_range(start=start_date, end=end_date, freq=freq)
    return [day.strftime('%Y-%m-%d') for day in dates]
    

start = '2017-01-01'
end = '2017-12-31'

daily_df = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-load.v3/fact/").where("granularity='daily' and date between '{}' and '{}'".format(start, end))
daily_df = daily_df.na.fill(0)

agg_df = daily_df.groupBy('app_id', 'country_code', 'device_code').agg(sum('est_free_app_download').alias('free_app_download'), sum('est_paid_app_download').alias('paid_app_download'), sum('est_revenue').alias('revenue'))
# print agg_df.count()
# agg_df.orderBy(desc('free_app_download')).show(1000)
agg_df.createOrReplaceTempView("agg_df")
top_df = spark.sql("select * from agg_df where country_code='WW' order by free_app_download desc, app_id desc")
top_df.show(1000)


quarterly_df_ho = spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-pre-aggr.v3/fact/").where("granularity='yearly' and date='{}' and data_stage='final'".format(end))
# print quarterly_df_ho.count()
# quarterly_df_ho.show()
quarterly_df_ho = quarterly_df_ho.na.fill(0)

quarterly_df_ho.createOrReplaceTempView("quarterly_df")

diff_df1 = spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from agg_df except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from quarterly_df")
diff_df2 = spark.sql("select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from quarterly_df except all select app_id, country_code, device_code, free_app_download, paid_app_download, revenue from agg_df")
print diff_df1.count(), diff_df2.count()
# diff_df1.show()
# diff_df2.show()

In [0]:

from pyspark.sql.functions import sum
from pyspark.sql.functions import desc


# Weekly category check for 2018-05-06 .. 2018-05-12: category keys come from the
# daily category table, metric values from the re-aggregated daily estimates.
start, end = '2018-05-06', '2018-05-12'

category_daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-load.v3/fact/")
    .where("granularity='daily' and date between '{}' and '{}'".format(start, end))
)
# Distinct (app, country, device, category) keys that have a free-download estimate.
category_id_df = (
    category_daily_df
    .select('app_id', 'country_code', 'device_code', 'category_id')
    .filter("est_free_app_download is not null")
    .distinct()
)
category_id_df.createOrReplaceTempView("category_id_df")

# App-level totals over the same days, nulls filled with 0 before summing.
est_daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-load.v3/fact/")
    .where("granularity='daily' and date between '{}' and '{}'".format(start, end))
    .na.fill(0)
)
est_agg_df = est_daily_df.groupBy('app_id', 'country_code', 'device_code').agg(
    sum('est_free_app_download').alias('free_app_download'),
    sum('est_paid_app_download').alias('paid_app_download'),
    sum('est_revenue').alias('revenue')
)
est_agg_df.createOrReplaceTempView("est_agg_df")

# Attach the app-level totals to every category key, then fill remaining nulls with 0.
agg_df = spark.sql(
    """select
                        d1.app_id,
                        d1.device_code,
                        d1.country_code,
                        d1.category_id,
                        d2.free_app_download,
                        d2.paid_app_download,
                        d2.revenue
                      FROM category_id_df as d1 
                        JOIN est_agg_df as d2 
                        ON d1.app_id=d2.app_id 
                        AND d1.device_code=d2.device_code 
                        AND d1.country_code=d2.country_code"""
).na.fill(0)
agg_df.createOrReplaceTempView("agg_df")
top_df = spark.sql(
    "select * from agg_df where country_code='WW' "
    "order by free_app_download desc, app_id desc"
)
top_df.show(1000)

# Stored weekly rows (final stage) that carry a free-download value, nulls filled with 0.
quarterly_df_ho = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v3/fact/")
    .where("granularity='weekly' and date='{}' and data_stage='final'".format(end))
    .filter("free_app_download is not null")
    .na.fill(0)
)
quarterly_df_ho.createOrReplaceTempView("quarterly_df")

# Only free_app_download is compared in this cell.
diff_df1 = spark.sql(
    "select app_id, category_id, country_code, device_code, free_app_download from agg_df"
    " except all "
    "select app_id, category_id, country_code, device_code, free_app_download from quarterly_df"
)
diff_df2 = spark.sql(
    "select app_id, category_id, country_code, device_code, free_app_download from quarterly_df"
    " except all "
    "select app_id, category_id, country_code, device_code, free_app_download from agg_df"
)
print("%s %s" % (agg_df.count(), quarterly_df_ho.count()))
print("%s %s" % (diff_df1.count(), diff_df2.count()))
diff_df1.show()
diff_df2.show()



In [0]:

# Monthly category check for May 2018: sum the daily category rows and compare
# them with the stored monthly rows.
start, end = '2018-05-01', '2018-05-31'

daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-load.v3/fact/")
    .where("granularity='daily' and date between '{}' and '{}'".format(start, end))
)
daily_df.createOrReplaceTempView("daily_df")

# Re-aggregation of the daily rows per (app, device, country, category).
agg_df = spark.sql(
    "select app_id, device_code, country_code, category_id, "
    "sum(est_free_app_download) as free_app_download, "
    "sum(est_paid_app_download) as paid_app_download, "
    "sum(est_revenue) as revenue "
    "from daily_df "
    "group by app_id, device_code, country_code, category_id"
)
agg_df.createOrReplaceTempView("agg_df")

# Stored monthly rows for the month-end date, final data stage only.
quarterly_df_ho = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v3/fact/")
    .where("granularity='monthly' and date='{}' and data_stage='final'".format(end))
)
quarterly_df_ho.createOrReplaceTempView("quarterly_df")

# Rows present on one side only, in each direction.
diff_df1 = spark.sql(
    "select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue from agg_df"
    " except all "
    "select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue from quarterly_df"
)
diff_df2 = spark.sql(
    "select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue from quarterly_df"
    " except all "
    "select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue from agg_df"
)
print("%s %s" % (agg_df.count(), quarterly_df_ho.count()))
print("%s %s" % (diff_df1.count(), diff_df2.count()))
diff_df1.show()
diff_df2.show()

In [0]:

# Weekly category check for 2018-05-06 .. 2018-05-12, done per metric: a metric is
# kept for a category key only when the category table has a non-null value for it,
# and the value itself is taken from the app-level totals.
start, end = '2018-05-06', '2018-05-12'

category_daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-load.v3/fact/")
    .where("granularity='daily' and date between '{}' and '{}'".format(start, end))
)
category_daily_df.createOrReplaceTempView("category_daily_df")

# Wide -> long: one row per (key, metric) for the daily category rows.
spark.sql(
    """ select date, app_id, country_code, device_code, category_id,
                    stack(3, 'free_app_download', est_free_app_download, 'paid_app_download', est_paid_app_download, 'revenue', est_revenue) as (metric, est)
                from  category_daily_df"""
).createOrReplaceTempView("unpivot_category_daily_df")

# Sum the non-null long rows per (key, category, metric).
spark.sql(
    """select app_id, country_code, device_code, category_id, metric, sum(est) as est
                from unpivot_category_daily_df
                where est is not null
                group by
                app_id,
                country_code,
                device_code,
                category_id,
                metric
"""
).createOrReplaceTempView("unpivot_category_agg_df")

est_daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-load.v3/fact/")
    .where("granularity='daily' and date between '{}' and '{}'".format(start, end))
)
est_daily_df.createOrReplaceTempView("est_daily_df")

# App-level totals over the period, also in long (metric, est) form.
spark.sql(
    """ select app_id, country_code, device_code, 
                stack(3, 'free_app_download', free_app_download, 'paid_app_download', paid_app_download, 'revenue', revenue) as (metric, est)
            FROM(
                select app_id, country_code, device_code, sum(est_free_app_download) as free_app_download, sum(est_paid_app_download) as paid_app_download, sum(est_revenue) as revenue from est_daily_df group by app_id, country_code, device_code)"""
).createOrReplaceTempView("unpivot_est_agg_df")

# Keys and metrics from the category side (d1), values from the app-level side (d2).
spark.sql(
    """select d1.app_id, d1.country_code, d1.device_code, d1.category_id, d1.metric, d2.est
                from unpivot_category_agg_df as d1 
                join unpivot_est_agg_df as d2 
                on
                d1.app_id=d2.app_id
                and d1.country_code=d2.country_code
                and d1.device_code=d2.device_code
                and d1.metric=d2.metric"""
).createOrReplaceTempView("unpivot_agg_df")

# Long -> wide again: one column per metric.
agg_df = spark.sql(
    """ select * FROM unpivot_agg_df
                pivot(
                    max(est) FOR metric IN ('free_app_download','revenue', 'paid_app_download')
                )
"""
)
agg_df.createOrReplaceTempView("agg_df")

# Stored weekly rows for the week-end date, final data stage only.
quarterly_df_ho = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v3/fact/")
    .where("granularity='weekly' and date='{}' and data_stage='final'".format(end))
)
quarterly_df_ho.createOrReplaceTempView("quarterly_df")

# Rows present on one side only, in each direction.
diff_df1 = spark.sql(
    "select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue from agg_df"
    " except all "
    "select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue from quarterly_df"
)
diff_df2 = spark.sql(
    "select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue from quarterly_df"
    " except all "
    "select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue from agg_df"
)
print("%s %s" % (agg_df.count(), quarterly_df_ho.count()))
print("%s %s" % (diff_df1.count(), diff_df2.count()))
diff_df1.show()
diff_df2.show()



In [0]:

# Inspect the stored yearly category rows for 2018 (final data stage).
# `start` is assigned but not referenced in this cell.
start, end = '2018-01-01', '2018-12-31'

quarterly_df_ho = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-pre-aggr.v3/fact/")
    .where("granularity='yearly' and date='{}' and data_stage='final'".format(end))
)
quarterly_df_ho.createOrReplaceTempView("quarterly_df_ho")

# One specific app in category 400026, worldwide rows.
spark.sql(
    "select app_id, device_code, country_code, category_id, free_app_download, revenue, paid_app_download "
    "from quarterly_df_ho "
    "where app_id=20600000013820 and category_id=400026 and country_code='WW'"
).show()

# Top 1000 worldwide rows of category 400026 by free downloads.
spark.sql(
    "select app_id, device_code, country_code, category_id, free_app_download, revenue, paid_app_download "
    "from quarterly_df_ho "
    "where category_id=400026 and country_code='WW' "
    "order by free_app_download desc, app_id desc limit 1000"
).show(1000)


In [0]:

# Inspect the stored monthly category rows dated 2018-04-30 (final data stage).
# `start` is assigned but not referenced in this cell.
start, end = '2018-01-01', '2018-04-30'

quarterly_df_ho = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v3/fact/")
    .where("granularity='monthly' and date='{}' and data_stage='final'".format(end))
)
quarterly_df_ho.createOrReplaceTempView("quarterly_df_ho")

# Top 1000 worldwide rows of category 400026 by free downloads.
spark.sql(
    "select app_id, device_code, country_code, category_id, free_app_download, revenue, paid_app_download "
    "from quarterly_df_ho "
    "where category_id=400026 and country_code='WW' "
    "order by free_app_download desc, app_id desc limit 1000"
).show(1000)

In [0]:

# Drill into one app/country/device: stored weekly rows next to the daily rows
# of the same week (2018-05-06 .. 2018-05-12).
start, end = '2018-05-06', '2018-05-12'

daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-load.v3/fact/")
    .where("granularity='daily' and date between '{}' and '{}'".format(start, end))
)
daily_df.createOrReplaceTempView("daily_df")

quarterly_df_ho = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v3/fact/")
    .where("granularity='weekly' and date='{}' and data_stage='final'".format(end))
)
quarterly_df_ho.createOrReplaceTempView("quarterly_df")

# Weekly rows first, then the daily rows ordered by category.
spark.sql(
    "select app_id,device_code,country_code,category_id,free_app_download from quarterly_df "
    "where app_id=1009836739 and country_code='AM' and device_code='ios-phone'"
).show()
spark.sql(
    "select app_id,device_code,country_code,category_id,est_free_app_download from daily_df "
    "where app_id=1009836739 and country_code='AM' and device_code='ios-phone' "
    "order by category_id"
).show()

In [0]:

def get_date_list(start_date, end_date, freq="D"):
    """
    Return the dates from start_date to end_date as 'YYYY-MM-DD' strings.

    start_date, end_date: range bounds, passed unchanged to pandas.date_range.
    freq:   D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency
    returns: list of str, one entry per timestamp produced by pandas.date_range.
    """
    # The docstring above must be the first statement; it used to follow this
    # import, which made it a no-op string and left __doc__ empty.
    import pandas as pd
    return [day.strftime('%Y-%m-%d') for day in pd.date_range(start=start_date, end=end_date, freq=freq)]
    
    
from pyspark.sql import types as T
from pyspark.sql import functions as F
# Column layout of the tab-separated raw estimate files; every field is nullable.
csv_schema = T.StructType(list(
    T.StructField(col_name, col_type, True)
    for col_name, col_type in (
        ("store_id", T.IntegerType()),
        ("date", T.DateType()),
        ("platform_id", T.IntegerType()),
        ("vertical", T.IntegerType()),
        ("feed", T.IntegerType()),
        ("id", T.LongType()),
        ("est", T.IntegerType()),
        ("category_id", T.IntegerType()),
        ("rank", T.IntegerType()),
    )
))

start, end = '2018-05-06', '2018-05-12'
# Daily dates of the week; joined into the {a,b,...} glob of the input path below.
test_data = get_date_list(start, end)
print(test_data)

# Read the iOS files for those days, tag them with platform='ios' and cache the result.
df_1 = (
    spark.read
    .option("basePath", "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/")
    .schema(csv_schema)
    .csv(
        "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/{%s}/ios/sbe_est_app/*/" % ",".join(test_data),
        sep="\t"
    )
    .withColumn("platform", F.lit("ios"))
    .select('id', 'store_id', 'category_id', 'platform_id', 'vertical', 'rank', 'feed', 'est', 'date', 'platform')
    .cache()
)
df_1.filter("id=1009836739 and platform='ios' and feed='0' and store_id=143524 and rank<=1000").show()

In [0]:

from pyspark.sql.functions import sum
from pyspark.sql.functions import desc

# Compare a re-aggregation of the daily category rows with the stored aggregate
# for the chosen granularity (week 2019-12-29 .. 2020-01-04).
start, end = '2019-12-29', '2020-01-04'
granularity = 'weekly'

category_daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-load.v3/fact/")
    .where("granularity='daily' and date between '{}' and '{}'".format(start, end))
)

# Sum the daily metrics per (app, country, device, category).
agg_df = category_daily_df.groupBy('app_id', 'country_code', 'device_code', 'category_id').agg(
    sum('est_free_app_download').alias('free_app_download'),
    sum('est_paid_app_download').alias('paid_app_download'),
    sum('est_revenue').alias('revenue')
)
agg_df.createOrReplaceTempView("agg_df")

# Weekly/monthly rows are read from the category table, anything else from the pre-aggr table.
if granularity in ('weekly', 'monthly'):
    pre_agg_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v3/fact/"
else:
    pre_agg_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-pre-aggr.v3/fact/"
pre_agg_df = (
    spark.read.format("delta")
    .load(pre_agg_path)
    .where("granularity='{}' and date='{}' and data_stage='final'".format(granularity, end))
)
pre_agg_df.createOrReplaceTempView("pre_agg_df")

# Rows present on one side only, in each direction.
diff_df1 = spark.sql(
    """select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue 
                            from agg_df 
                        except all 
                        select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue 
                            from pre_agg_df"""
)
diff_df2 = spark.sql(
    """select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue 
                            from pre_agg_df 
                        except all 
                        select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue 
                            from agg_df"""
)
print("%s %s" % (agg_df.count(), pre_agg_df.count()))
print("%s %s" % (diff_df1.count(), diff_df2.count()))
diff_df1.show()
diff_df2.show()

In [0]:

# Build per-metric expected values for the week 2019-12-29 .. 2020-01-04: a metric
# is kept for a category key only when the category table has a non-null value for
# it, and the value itself is taken from the app-level totals.
start, end = '2019-12-29', '2020-01-04'
granularity = 'weekly'

category_daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-load.v3/fact/")
    .where("granularity='daily' and date between '{}' and '{}'".format(start, end))
)
category_daily_df.createOrReplaceTempView("category_daily_df")

# Wide -> long: one row per (key, metric) for the daily category rows.
spark.sql(
    """ select date, app_id, country_code, device_code, category_id,
                    stack(3, 'free_app_download', est_free_app_download, 'paid_app_download', est_paid_app_download, 'revenue', est_revenue) 
                    as (metric, est)
                from  category_daily_df"""
).createOrReplaceTempView("unpivot_category_daily_df")

# Sum the non-null long rows per (key, category, metric).
spark.sql(
    """select app_id, country_code, device_code, category_id, metric, sum(est) as est
                from unpivot_category_daily_df
                where est is not null
                group by
                app_id,
                country_code,
                device_code,
                category_id,
                metric
"""
).createOrReplaceTempView("unpivot_category_agg_df")

est_daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-load.v3/fact/")
    .where("granularity='daily' and date between '{}' and '{}'".format(start, end))
)
est_daily_df.createOrReplaceTempView("est_daily_df")

# App-level totals over the period, also in long (metric, est) form.
spark.sql(
    """ select app_id, country_code, device_code, 
                stack(3, 'free_app_download', free_app_download, 'paid_app_download', paid_app_download, 'revenue', revenue) as (metric, est)
            FROM(
                select app_id, country_code, device_code, 
                sum(est_free_app_download) as free_app_download, 
                sum(est_paid_app_download) as paid_app_download, 
                sum(est_revenue) as revenue 
                from est_daily_df 
                group by app_id, country_code, device_code)"""
).createOrReplaceTempView("unpivot_est_agg_df")

# Keys and metrics from the category side (d1), values from the app-level side (d2).
spark.sql(
    """select d1.app_id, d1.country_code, d1.device_code, d1.category_id, d1.metric, d2.est
                from unpivot_category_agg_df as d1 
                join unpivot_est_agg_df as d2 
                on
                d1.app_id=d2.app_id
                and d1.country_code=d2.country_code
                and d1.device_code=d2.device_code
                and d1.metric=d2.metric"""
).createOrReplaceTempView("unpivot_agg_df")

# Long -> wide again: one column per metric.
spark.sql(
    """ select * FROM unpivot_agg_df
                pivot(
                    max(est) FOR metric IN ('free_app_download','revenue', 'paid_app_download')
                )
"""
).createOrReplaceTempView("agg_df")

# Top 1000 worldwide android-all rows of category 400026 by free downloads.
spark.sql(
    "select * from agg_df where category_id=400026 and country_code='WW' and device_code='android-all' "
    "order by free_app_download desc, app_id desc limit 1000"
).show(1000)

In [0]:

from pyspark.sql.functions import sum
from pyspark.sql.functions import desc

# Compare a re-aggregation of the daily category rows with the stored aggregate
# for the chosen granularity (January 2020).
start, end = '2020-01-01', '2020-01-31'
granularity = 'monthly'

category_daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-load.v3/fact/")
    .where("granularity='daily' and date between '{}' and '{}'".format(start, end))
)

# Sum the daily metrics per (app, country, device, category).
agg_df = category_daily_df.groupBy('app_id', 'country_code', 'device_code', 'category_id').agg(
    sum('est_free_app_download').alias('free_app_download'),
    sum('est_paid_app_download').alias('paid_app_download'),
    sum('est_revenue').alias('revenue')
)
agg_df.createOrReplaceTempView("agg_df")

# Weekly/monthly rows are read from the category table, anything else from the pre-aggr table.
if granularity in ('weekly', 'monthly'):
    pre_agg_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v3/fact/"
else:
    pre_agg_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-pre-aggr.v3/fact/"
pre_agg_df = (
    spark.read.format("delta")
    .load(pre_agg_path)
    .where("granularity='{}' and date='{}' and data_stage='final'".format(granularity, end))
)
pre_agg_df.createOrReplaceTempView("pre_agg_df")

# Rows present on one side only, in each direction.
diff_df1 = spark.sql(
    """select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue 
                            from agg_df 
                        except all 
                        select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue 
                            from pre_agg_df"""
)
diff_df2 = spark.sql(
    """select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue 
                            from pre_agg_df 
                        except all 
                        select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue 
                            from agg_df"""
)
print("%s %s" % (agg_df.count(), pre_agg_df.count()))
print("%s %s" % (diff_df1.count(), diff_df2.count()))
diff_df1.show()
diff_df2.show()

In [0]:

# Build per-metric expected values for January 2020: a metric is kept for a
# category key only when the category table has a non-null value for it, and the
# value itself is taken from the app-level totals.
start, end = '2020-01-01', '2020-01-31'
granularity = 'monthly'

category_daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-load.v3/fact/")
    .where("granularity='daily' and date between '{}' and '{}'".format(start, end))
)
category_daily_df.createOrReplaceTempView("category_daily_df")

# Wide -> long: one row per (key, metric) for the daily category rows.
spark.sql(
    """ select date, app_id, country_code, device_code, category_id,
                    stack(3, 'free_app_download', est_free_app_download, 'paid_app_download', est_paid_app_download, 'revenue', est_revenue) 
                    as (metric, est)
                from  category_daily_df"""
).createOrReplaceTempView("unpivot_category_daily_df")

# Sum the non-null long rows per (key, category, metric).
spark.sql(
    """select app_id, country_code, device_code, category_id, metric, sum(est) as est
                from unpivot_category_daily_df
                where est is not null
                group by
                app_id,
                country_code,
                device_code,
                category_id,
                metric
"""
).createOrReplaceTempView("unpivot_category_agg_df")

est_daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-load.v3/fact/")
    .where("granularity='daily' and date between '{}' and '{}'".format(start, end))
)
est_daily_df.createOrReplaceTempView("est_daily_df")

# App-level totals over the period, also in long (metric, est) form.
spark.sql(
    """ select app_id, country_code, device_code, 
                stack(3, 'free_app_download', free_app_download, 'paid_app_download', paid_app_download, 'revenue', revenue) as (metric, est)
            FROM(
                select app_id, country_code, device_code, 
                sum(est_free_app_download) as free_app_download, 
                sum(est_paid_app_download) as paid_app_download, 
                sum(est_revenue) as revenue 
                from est_daily_df 
                group by app_id, country_code, device_code)"""
).createOrReplaceTempView("unpivot_est_agg_df")

# Keys and metrics from the category side (d1), values from the app-level side (d2).
spark.sql(
    """select d1.app_id, d1.country_code, d1.device_code, d1.category_id, d1.metric, d2.est
                from unpivot_category_agg_df as d1 
                join unpivot_est_agg_df as d2 
                on
                d1.app_id=d2.app_id
                and d1.country_code=d2.country_code
                and d1.device_code=d2.device_code
                and d1.metric=d2.metric"""
).createOrReplaceTempView("unpivot_agg_df")

# Long -> wide again: one column per metric.
spark.sql(
    """ select * FROM unpivot_agg_df
                pivot(
                    max(est) FOR metric IN ('free_app_download','revenue', 'paid_app_download')
                )
"""
).createOrReplaceTempView("agg_df")

# Top 1000 worldwide android-all rows of category 400026 by free downloads.
spark.sql(
    "select * from agg_df where category_id=400026 and country_code='WW' and device_code='android-all' "
    "order by free_app_download desc, app_id desc limit 1000"
).show(1000)

In [0]:

from pyspark.sql.functions import sum
from pyspark.sql.functions import desc

# Compare a re-aggregation of the daily category rows with the stored aggregate
# for the chosen granularity (year 2019).
start, end = '2019-01-01', '2019-12-31'
granularity = 'yearly'

category_daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-load.v3/fact/")
    .where("granularity='daily' and date between '{}' and '{}'".format(start, end))
)

# Sum the daily metrics per (app, country, device, category).
agg_df = category_daily_df.groupBy('app_id', 'country_code', 'device_code', 'category_id').agg(
    sum('est_free_app_download').alias('free_app_download'),
    sum('est_paid_app_download').alias('paid_app_download'),
    sum('est_revenue').alias('revenue')
)
agg_df.createOrReplaceTempView("agg_df")

# Weekly/monthly rows are read from the category table, anything else from the pre-aggr table.
if granularity in ('weekly', 'monthly'):
    pre_agg_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v3/fact/"
else:
    pre_agg_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-pre-aggr.v3/fact/"
pre_agg_df = (
    spark.read.format("delta")
    .load(pre_agg_path)
    .where("granularity='{}' and date='{}' and data_stage='final'".format(granularity, end))
)
pre_agg_df.createOrReplaceTempView("pre_agg_df")

# Rows present on one side only, in each direction.
diff_df1 = spark.sql(
    """select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue 
                            from agg_df 
                        except all 
                        select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue 
                            from pre_agg_df"""
)
diff_df2 = spark.sql(
    """select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue 
                            from pre_agg_df 
                        except all 
                        select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue 
                            from agg_df"""
)
print("%s %s" % (agg_df.count(), pre_agg_df.count()))
print("%s %s" % (diff_df1.count(), diff_df2.count()))
diff_df1.show()
diff_df2.show()

In [0]:

# Join the stored yearly category rows with the stored yearly app-level rows for 2019.
start, end = '2019-01-01', '2019-12-31'
granularity = 'yearly'

quarterly_category_bucket = "store.app-est-category-pre-aggr.v3"
quarterly_est_bucket = "store.app-est-pre-aggr.v3"

category_daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/{}/fact/".format(quarterly_category_bucket))
    .where("granularity='{}' and date between '{}' and '{}'".format(granularity, start, end))
)
category_daily_df.createOrReplaceTempView("category_daily_df1")
# Category side: keys plus the three metric columns.
spark.sql(
    """ select date, app_id, country_code, device_code, category_id,
                    free_app_download,paid_app_download, revenue
                from  category_daily_df1"""
).createOrReplaceTempView("unpivot_category_daily_df1")

est_daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/{}/fact/".format(quarterly_est_bucket))
    .where("granularity='{}' and date between '{}' and '{}'".format(granularity, start, end))
)
est_daily_df.createOrReplaceTempView("est_daily_df1")
# App-level side: keys plus the three metric columns.
spark.sql(
    """ select app_id, country_code, device_code, 
                 free_app_download, paid_app_download, revenue
            FROM est_daily_df1 """
).createOrReplaceTempView("unpivot_est_agg_df1")

# Variant 1: category keys with the app-level metrics taken as they are.
spark.sql(
    """select d1.app_id, d1.country_code, d1.device_code, d1.category_id, d2.free_app_download, d2.paid_app_download, d2.revenue
                from unpivot_category_daily_df1 as d1 
                join unpivot_est_agg_df1 as d2 
                on
                d1.app_id=d2.app_id
                and d1.country_code=d2.country_code
                and d1.device_code=d2.device_code
             """
).cache().createOrReplaceTempView("unpivot_agg_df1")

# Variant 2: a metric stays NULL where the category side is NULL, otherwise the
# app-level value is used.
spark.sql(
    """select d1.app_id, d1.country_code, d1.device_code, d1.category_id, 
                case when d1.free_app_download is null then d1.free_app_download else d2.free_app_download end free_app_download, 
                case when d1.paid_app_download is null then d1.paid_app_download else d2.paid_app_download end paid_app_download, 
                case when d1.revenue is null then d1.revenue else d2.revenue end revenue
                from unpivot_category_daily_df1 as d1 
                join unpivot_est_agg_df1 as d2 
                on
                d1.app_id=d2.app_id
                and d1.country_code=d2.country_code
                and d1.device_code=d2.device_code
             """
).cache().createOrReplaceTempView("unpivot_agg_df2")

# Top 1000 worldwide android-all rows of category 400026 by free downloads.
spark.sql(
    "select * from unpivot_agg_df2 where category_id=400026 and country_code='WW' and device_code='android-all' "
    "order by free_app_download desc, app_id desc limit 1000"
).show(1000)

In [0]:

# Drill into one app/category/country/device: its daily rows for the week
# 2019-12-29 .. 2020-01-04 next to the stored weekly row.
start, end = '2019-12-29', '2020-01-04'
granularity = 'weekly'

category_daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-load.v3/fact/")
    .where("granularity='daily' and date between '{}' and '{}'".format(start, end))
)
category_daily_df.filter(
    "app_id=1217351254 and category_id=100071 and country_code='AO' and device_code='ios-phone'"
).show()

pre_agg_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v3/fact/")
    .where("granularity='{}' and date='{}' and data_stage='final'".format(granularity, end))
)
pre_agg_df.filter(
    "app_id=1217351254 and category_id=100071 and country_code='AO' and device_code='ios-phone'"
).show()

In [0]:
%%sh
# Recursively list the objects under the granularity=yearly/date=2019-12-31 prefix of the category pre-aggr table; the listing is piped through `sort -n`.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-pre-aggr.v3/fact/granularity=yearly/date=2019-12-31/ --recursive | sort -n

In [0]:

def get_date_list(start_date, end_date, freq="D"):
    """
    Return the dates from start_date to end_date as 'YYYY-MM-DD' strings.

    start_date, end_date: range bounds, passed unchanged to pandas.date_range.
    freq:   D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency
    returns: list of str, one entry per timestamp produced by pandas.date_range.
    """
    # The docstring above must be the first statement; it used to follow this
    # import, which made it a no-op string and left __doc__ empty.
    import pandas as pd
    return [day.strftime('%Y-%m-%d') for day in pd.date_range(start=start_date, end=end_date, freq=freq)]
    

# Daily dates of the week 2019-09-08 .. 2019-09-14; joined into the date={a,b,...}
# glob of the input path below.
test_data = get_date_list('2019-09-08', '2019-09-14')

# Read the parquet files of those dates for every platform.
df_1 = (
    spark.read
    .option("basePath", "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/")
    .parquet("s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/date={%s}/platform=*/*/" % ",".join(test_data))
)

# Fixed: this used to be `df1.filter("")`, which fails on two counts: `df1` is never
# defined (the frame is `df_1`), and an empty string is not a valid filter expression.
# Inspect the loaded frame instead; add a real predicate here when one is needed.
df_1.printSchema()
df_1.show()

In [0]:

# Join the stored weekly category rows with the stored weekly app-level rows
# (date between 2019-12-29 and 2020-01-04).
start, end = '2019-12-29', '2020-01-04'
granularity = 'weekly'

quarterly_category_bucket = "store.app-est-category.v3"
quarterly_est_bucket = "store.app-est.v3"

category_daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/{}/fact/".format(quarterly_category_bucket))
    .where("granularity='{}' and date between '{}' and '{}'".format(granularity, start, end))
)
category_daily_df.createOrReplaceTempView("category_daily_df1")
# Category side: keys plus the three metric columns.
spark.sql(
    """ select date, app_id, country_code, device_code, category_id,
                    free_app_download,paid_app_download, revenue
                from  category_daily_df1"""
).createOrReplaceTempView("unpivot_category_daily_df1")

est_daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/{}/fact/".format(quarterly_est_bucket))
    .where("granularity='{}' and date between '{}' and '{}'".format(granularity, start, end))
)
est_daily_df.createOrReplaceTempView("est_daily_df1")
# App-level side: keys plus the three metric columns.
spark.sql(
    """ select app_id, country_code, device_code, 
                 free_app_download, paid_app_download, revenue
            FROM est_daily_df1 """
).createOrReplaceTempView("unpivot_est_agg_df1")

# Variant 1: category keys with the app-level metrics taken as they are.
spark.sql(
    """select d1.app_id, d1.country_code, d1.device_code, d1.category_id, d2.free_app_download, d2.paid_app_download, d2.revenue
                from unpivot_category_daily_df1 as d1 
                join unpivot_est_agg_df1 as d2 
                on
                d1.app_id=d2.app_id
                and d1.country_code=d2.country_code
                and d1.device_code=d2.device_code
             """
).cache().createOrReplaceTempView("unpivot_agg_df1")

# Variant 2: a metric stays NULL where the category side is NULL, otherwise the
# app-level value is used.
spark.sql(
    """select d1.app_id, d1.country_code, d1.device_code, d1.category_id, 
                case when d1.free_app_download is null then d1.free_app_download else d2.free_app_download end free_app_download, 
                case when d1.paid_app_download is null then d1.paid_app_download else d2.paid_app_download end paid_app_download, 
                case when d1.revenue is null then d1.revenue else d2.revenue end revenue
                from unpivot_category_daily_df1 as d1 
                join unpivot_est_agg_df1 as d2 
                on
                d1.app_id=d2.app_id
                and d1.country_code=d2.country_code
                and d1.device_code=d2.device_code
             """
).cache().createOrReplaceTempView("unpivot_agg_df2")

In [0]:

# Build per-metric expected values for the week 2019-12-29 .. 2020-01-04 and cache
# the pivoted result as the `agg_df` view: a metric is kept for a category key only
# when the category table has a non-null value for it, and the value itself is
# taken from the app-level totals.
start, end = '2019-12-29', '2020-01-04'
granularity = 'weekly'

category_daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-load.v3/fact/")
    .where("granularity='daily' and date between '{}' and '{}'".format(start, end))
)
category_daily_df.createOrReplaceTempView("category_daily_df")

# Wide -> long: one row per (key, metric) for the daily category rows.
spark.sql(
    """ select date, app_id, country_code, device_code, category_id,
                    stack(3, 'free_app_download', est_free_app_download, 'paid_app_download', est_paid_app_download, 'revenue', est_revenue) 
                    as (metric, est)
                from  category_daily_df"""
).createOrReplaceTempView("unpivot_category_daily_df")

# Sum the non-null long rows per (key, category, metric).
spark.sql(
    """select app_id, country_code, device_code, category_id, metric, sum(est) as est
                from unpivot_category_daily_df
                where est is not null
                group by
                app_id,
                country_code,
                device_code,
                category_id,
                metric
"""
).createOrReplaceTempView("unpivot_category_agg_df")

est_daily_df = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-load.v3/fact/")
    .where("granularity='daily' and date between '{}' and '{}'".format(start, end))
)
est_daily_df.createOrReplaceTempView("est_daily_df")

# App-level totals over the period, also in long (metric, est) form.
spark.sql(
    """ select app_id, country_code, device_code, 
                stack(3, 'free_app_download', free_app_download, 'paid_app_download', paid_app_download, 'revenue', revenue) as (metric, est)
            FROM(
                select app_id, country_code, device_code, 
                sum(est_free_app_download) as free_app_download, 
                sum(est_paid_app_download) as paid_app_download, 
                sum(est_revenue) as revenue 
                from est_daily_df 
                group by app_id, country_code, device_code)"""
).createOrReplaceTempView("unpivot_est_agg_df")

# Keys and metrics from the category side (d1), values from the app-level side (d2).
spark.sql(
    """select d1.app_id, d1.country_code, d1.device_code, d1.category_id, d1.metric, d2.est
                from unpivot_category_agg_df as d1 
                join unpivot_est_agg_df as d2 
                on
                d1.app_id=d2.app_id
                and d1.country_code=d2.country_code
                and d1.device_code=d2.device_code
                and d1.metric=d2.metric"""
).createOrReplaceTempView("unpivot_agg_df")

# Long -> wide again (one column per metric), cached before being registered.
spark.sql(
    """ select * FROM unpivot_agg_df
                pivot(
                    max(est) FOR metric IN ('free_app_download','revenue', 'paid_app_download')
                )
"""
).cache().createOrReplaceTempView("agg_df")


In [0]:

# Rows of the `agg_df` view that have no match in the `unpivot_agg_df2` view,
# largest free-download values first. Both views must already be registered.
diff_df = spark.sql(
    """select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue 
                            from agg_df 
                        except all 
                        select app_id, category_id, country_code, device_code, free_app_download, paid_app_download, revenue 
                            from unpivot_agg_df2
                        order by free_app_download desc"""
)
diff_df.show(999)

In [0]:

# Follow one app (worldwide, android-all) through three registered views: the
# app-level totals, the category sums and the joined long-format result.
spark.sql(
    "select * from unpivot_est_agg_df "
    "where app_id=20600011926298 and country_code='WW' and device_code='android-all'"
).show()
spark.sql(
    "select * from unpivot_category_agg_df "
    "where app_id=20600011926298 and country_code='WW' and device_code='android-all'"
).show()
spark.sql(
    "select * from unpivot_agg_df "
    "where app_id=20600011926298 and country_code='WW' and device_code='android-all'"
).show()

In [0]:

# The same app in the two views being compared: `unpivot_agg_df2` (restricted to
# category 400000) and `agg_df` (all categories).
spark.sql(
    "select * from unpivot_agg_df2 "
    "where app_id=20600011926298 and category_id=400000 and country_code='WW' and device_code='android-all'"
).show()

spark.sql(
    "select * from agg_df "
    "where app_id=20600011926298 and country_code='WW' and device_code='android-all'"
).show()

In [0]:

# Rows of the `category_daily_df` view for the same app in category 400000.
spark.sql(
    "select * from category_daily_df "
    "where app_id=20600011926298 and category_id=400000 and country_code='WW' and device_code='android-all'"
).show()

In [0]:

# Environment bootstrap for the cells that follow. Statement order matters here:
# the code refresh and addPyFile run before the project imports.
from bdce.common.utils import update_application_code
# NOTE(review): presumably refreshes the application code used by this notebook
# (the /tmp/zeppelin_application_code path below suggests where it lands) -- confirm
# against bdce.common.utils.
update_application_code(spark, role="BDP-PROD-APP-INT-QA", application_name="zidong-application-autopipeline")

# Register the bundled dependencies zip with the Spark context.
# NOTE(review): assumes the imports below resolve from this zip / the refreshed code -- verify.
spark.sparkContext.addPyFile("/tmp/zeppelin_application_code/libs/python/dependencies.zip")
import aaplproxy
from applications.auto_pipeline.temp_script.utils.base_test import PipelineTest

In [0]:

import aaplproxy
import datetime
from dateutil.relativedelta import relativedelta
from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.pipeline import type_
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from applications.auto_pipeline.temp_script.utils.base_test import PipelineTest
from applications.auto_pipeline.transform import _view