In [0]:

# Preview the raw Android market-size estimates for 2019-12-01.
# The read passes no schema, so column names and types are whatever Spark's
# CSV reader assigns by default.
#
# createOrReplaceTempView() returns None, so the DataFrame is bound to `res`
# first and the view is registered in a separate statement; chaining both
# calls in one expression left `res` set to None.
res = spark.read.csv(
    "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/2019-12-01/android/market_size/*",
    sep="\t",
)
res.createOrReplaceTempView("raw_data")
spark.sql("select * from raw_data").show()

In [0]:

from pyspark.sql.functions import lit, udf
from pyspark.sql.types import (
    ArrayType, BooleanType, LongType, IntegerType,
    StringType, StructType, StructField
)
# Column layout of the tab-separated market-size files, in file order.
# Every field is declared non-nullable (third StructField argument is False).
_ios_market_size_columns = [
    ("store_id", IntegerType()),
    ("date", StringType()),
    ("platform_id", IntegerType()),
    ("device", StringType()),
    ("data_type", StringType()),
    ("price_type", IntegerType()),
    ("purchase_type", IntegerType()),
    ("category_id", IntegerType()),
    ("estimate", LongType()),
]
schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in _ios_market_size_columns
])

# The inner UNION ALL tags each raw row with a device_code chosen from its
# (platform_id, device) pair; the PIVOT then turns the 'downloads' and
# 'revenue' values of data_type into two columns filled with FIRST(estimate).
sql = """
    SELECT store_id, date, price_type as app_price_type_id, purchase_type as purchase_type_id, category_id,
    device_code, downloads as est_market_size_download, revenue as est_market_size_revenue, 'daily' as granularity
    FROM (
        SELECT *,'ios-all' AS device_code FROM raw_data WHERE platform_id = 1 AND device='ios'
            UNION ALL
        SELECT *,'ios-phone' AS device_code FROM raw_data WHERE platform_id = 1 AND device='iphone'
            UNION ALL
        SELECT *,'ios-tablet' AS device_code FROM raw_data WHERE platform_id = 1 AND device='ipad'
            UNION ALL
        SELECT *,'android-all' AS device_code FROM raw_data WHERE platform_id = 0 AND device='google-play'
    ) PIVOT (FIRST(estimate) for data_type in ('downloads', 'revenue'))
"""

# Only the first 10 rows of the 2020-01-04 iOS files are registered as
# "raw_data", so the pivot below runs on that small sample.
ios_market_size_sample = spark.read.csv(
    "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/2020-01-04/ios/market_size/*",
    schema=schema,
    sep='\t',
).limit(10)
ios_market_size_sample.createOrReplaceTempView("raw_data")

ios_pivoted = spark.sql(sql)
ios_pivoted.show()

In [0]:

from pyspark.sql.functions import lit, udf
from pyspark.sql.types import (
    ArrayType, BooleanType, LongType, IntegerType,
    StringType, StructType, StructField
)
# Schema of the tab-separated raw market-size files (all fields declared
# non-nullable).
schema = StructType([
    StructField("store_id", IntegerType(), False),
    StructField("date", StringType(), False),
    StructField("platform_id", IntegerType(), False),
    StructField("device", StringType(), False),
    StructField("data_type", StringType(), False),
    StructField("price_type", IntegerType(), False),
    StructField("purchase_type", IntegerType(), False),
    StructField("category_id", IntegerType(), False),
    StructField("estimate", LongType(), False)
])

# Android-only variant of the pivot: rows with platform_id = 0 and
# device = 'google-play' are tagged 'android-all', then the 'downloads' and
# 'revenue' values of data_type become columns filled with FIRST(estimate).
# Note that category_id is exposed to callers as legacy_category_id.
sql = """
SELECT store_id, date, price_type as app_price_type_id, purchase_type as purchase_type_id, category_id AS legacy_category_id,
    device_code, downloads as est_market_size_download, revenue as est_market_size_revenue, 'daily' as granularity
    FROM (
        SELECT *,'android-all' AS device_code FROM raw_data WHERE platform_id = 0 AND device='google-play'
    ) PIVOT (FIRST(estimate) for data_type in ('downloads', 'revenue'))
"""

# Register the full 2020-01-10 Android data set as the "raw_data" view.
spark.read.csv("s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/2020-01-10/android/market_size/*", schema=schema, sep='\t').createOrReplaceTempView("raw_data")

# Filter on the column names the query actually outputs: the SELECT list
# renames category_id to legacy_category_id, so the predicate uses the
# projected name rather than the pre-alias one.
spark.sql(sql).filter("store_id = 10 and app_price_type_id=0 and purchase_type_id=10 and device_code='android-all' and legacy_category_id=38").show()

In [0]:

from pyspark.sql.functions import lit, udf
from pyspark.sql.types import (
    ArrayType, BooleanType, LongType, IntegerType,
    StringType, StructType, StructField
)
# Schema of the tab-separated raw market-size files (all fields declared
# non-nullable).
schema = StructType([
    StructField("store_id", IntegerType(), False),
    StructField("date", StringType(), False),
    StructField("platform_id", IntegerType(), False),
    StructField("device", StringType(), False),
    StructField("data_type", StringType(), False),
    StructField("price_type", IntegerType(), False),
    StructField("purchase_type", IntegerType(), False),
    StructField("category_id", IntegerType(), False),
    StructField("estimate", LongType(), False)
])

# Load the unified fact table and both raw inputs for 2020-01-10.
path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.market-size.v1/fact/granularity=daily/date=2020-01-10/"
unified = spark.read.parquet(path)
raw_android = spark.read.csv("s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/2020-01-10/android/market_size/*", schema=schema, sep='\t')
raw_ios = spark.read.csv("s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/2020-01-10/ios/market_size/*", schema=schema, sep='\t')

# NOTE: toPandas() materialises each complete data set on the driver.
unified_df = unified.toPandas()
raw_android_df = raw_android.toPandas()
raw_ios_df = raw_ios.toPandas()

# All prints use the parenthesised single-argument form, which is valid on
# both Python 2 and Python 3 and prints the same text on each; the bare
# `print len(...)` statement form is a SyntaxError on Python 3.

# country check
print(len(unified_df['country_code'].unique()))  # 157 (value noted by the original author)

# category check: distinct ids per source, then their counts
print(unified_df['category_id'].unique())
print(raw_ios_df['category_id'].unique())
print(raw_android_df['category_id'].unique())
print(len(unified_df['category_id'].unique()))
print(len(raw_ios_df['category_id'].unique()))
print(len(raw_android_df['category_id'].unique()))
# number of distinct category ids across the iOS and Android raw inputs combined
print(len(set(raw_ios_df['category_id'].unique().tolist()) | set(raw_android_df['category_id'].unique().tolist())))

# device check
print(unified_df['device_code'].unique())
# app_price_type_id
print(unified_df['app_price_type_id'].unique())
# purchase_type_id
print(unified_df['purchase_type_id'].unique())

In [0]:
%%sh
# List the contents of the 2020-01-10 daily store-estimates prefix.
# Alternative prefix that was listed previously (unified market-size fact, same date):
#   s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.market-size.v1/fact/granularity=daily/date=2020-01-10/
estimates_prefix="s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/2020-01-10/"
aws s3 ls "${estimates_prefix}"

In [0]:

from pyspark.sql.functions import lit, udf
from pyspark.sql.types import (
    ArrayType, BooleanType, LongType, IntegerType,
    StringType, StructType, StructField
)
# Column layout of the tab-separated raw market-size files, in file order.
# Every field is declared non-nullable (third StructField argument is False).
_raw_estimate_columns = [
    ("store_id", IntegerType()),
    ("date", StringType()),
    ("platform_id", IntegerType()),
    ("device", StringType()),
    ("data_type", StringType()),
    ("price_type", IntegerType()),
    ("purchase_type", IntegerType()),
    ("category_id", IntegerType()),
    ("estimate", LongType()),
]
schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in _raw_estimate_columns
])

# check metrics value
# Show one slice of the unified fact table next to a slice of the raw
# Android file so the metric values can be compared by eye.
# NOTE(review): the two filters use different category ids (400003 vs 39);
# presumably these are the unified and legacy ids of the same category,
# TODO confirm.
unified_slice = spark.read.parquet(
    "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.market-size.v1/fact/granularity=daily/date=2020-01-10/"
).filter(
    "country_code='US' and app_price_type_id=0 and purchase_type_id=10 and device_code='android-all' and category_id=400003"
)
unified_slice.show()

raw_android_slice = spark.read.csv(
    "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/2020-01-10/android/market_size/10",
    schema=schema,
    sep='\t',
).filter(
    "price_type=0 and purchase_type=10 and category_id=39 and device='google-play'"
)
raw_android_slice.show()
# check dimension