In [0]:

from pyspark.sql import functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

# Google Play category table as (legacy_category_id, category_id) pairs.
# category_to_legacy_category() inverts it to look up the legacy id from
# the 4000xx category id.
ANDROID_CATEGORIES = [
    (1, 400000), (2, 400001), (3, 400022), (4, 400023), (5, 400024),
    (6, 400008), (7, 400011), (8, 400014), (9, 400017), (10, 400020),
    (11, 400025), (12, 400030), (13, 400031), (14, 400032), (15, 400033),
    (16, 400035), (17, 400036), (18, 400038), (19, 400040), (20, 400042),
    (21, 400043), (22, 400044), (23, 400058), (24, 400046), (25, 400047),
    (26, 400048), (27, 400050), (28, 400051), (29, 400052), (30, 400053),
    (31, 400054), (32, 400055), (33, 400056), (34, 400045), (35, 400057),
    (36, 400059), (37, 400060), (38, 400002), (39, 400003), (40, 400021),
    (41, 400004), (42, 400005), (43, 400006), (44, 400007), (46, 400009),
    (47, 400010), (48, 400012), (49, 400013), (51, 400015), (52, 400016),
    (54, 400018), (55, 400019), (56, 400061), (57, 400063), (58, 400064),
    (59, 400065), (60, 400062), (61, 400066), (62, 400067), (63, 400068),
    (64, 400069), (65, 400070), (66, 400026), (67, 400027), (68, 400041),
    (69, 400028), (70, 400029), (71, 400034), (72, 400037), (73, 400039),
    (75, 400049)
]
# Apple App Store category table as (legacy_category_id, category_id) pairs.
# category_to_legacy_category() inverts it to look up the legacy id from
# the 1000xx category id.
IOS_CATEGORIES = [
    (36, 100000), (100, 100021), (360, 100030), (361, 100031), (362, 100032),
    (363, 100033), (6000, 100023), (6001, 100077), (6002, 100076), (6003, 100075),
    (6004, 100073), (6005, 100072), (6006, 100070), (6007, 100069), (6008, 100068),
    (6009, 100067), (6010, 100066), (6011, 100065), (6012, 100034), (6013, 100029),
    (6014, 100001), (6015, 100027), (6016, 100026), (6017, 100025), (6018, 100022),
    (6020, 100064), (6021, 100035), (6022, 100024), (6023, 100028), (6024, 100071),
    (6025, 100074), (7001, 100002), (7002, 100003), (7003, 100004), (7004, 100005),
    (7005, 100006), (7006, 100007), (7007, 100008), (7008, 100009), (7009, 100010),
    (7010, 100011), (7011, 100012), (7012, 100013), (7013, 100014), (7014, 100015),
    (7015, 100016), (7016, 100017), (7017, 100018), (7018, 100019), (7019, 100020),
    (13001, 100053), (13002, 100046), (13003, 100049), (13004, 100054), (13005, 100060),
    (13006, 100037), (13007, 100036), (13008, 100038), (13009, 100039), (13010, 100040),
    (13011, 100041), (13012, 100042), (13013, 100043), (13014, 100044), (13015, 100045),
    (13017, 100047), (13018, 100048), (13019, 100050), (13020, 100051), (13021, 100052),
    (13023, 100055), (13024, 100056), (13025, 100057), (13026, 100058), (13027, 100059),
    (13028, 100061), (13029, 100062), (13030, 100063)
]
# Apple App Store storefront table as (store_id, country_code) pairs.
# country_code_to_id() inverts it to look up the store id from a country code;
# store id 0 stands for the 'WW' pseudo-country.
# Each pair is listed exactly once (a duplicated (143518, 'EE') entry that
# used to trail the list has been removed; lookups are unaffected).
IOS_STORE_COUNTRY_MAPPING = [
    (0, 'WW'), (143575, 'AL'), (143563, 'DZ'), (143564, 'AO'), (143538, 'AI'),
    (143540, 'AG'), (143505, 'AR'), (143524, 'AM'), (143460, 'AU'), (143445, 'AT'),
    (143568, 'AZ'), (143539, 'BS'), (143559, 'BH'), (143541, 'BB'), (143565, 'BY'),
    (143446, 'BE'), (143555, 'BZ'), (143576, 'BJ'), (143542, 'BM'), (143577, 'BT'),
    (143556, 'BO'), (143525, 'BW'), (143503, 'BR'), (143543, 'VG'), (143560, 'BN'),
    (143526, 'BG'), (143578, 'BF'), (143579, 'KH'), (143455, 'CA'), (143580, 'CV'),
    (143544, 'KY'), (143581, 'TD'), (143483, 'CL'), (143465, 'CN'), (143501, 'CO'),
    (143582, 'CG'), (143495, 'CR'), (143494, 'HR'), (143557, 'CY'), (143489, 'CZ'),
    (143458, 'DK'), (143545, 'DM'), (143508, 'DO'), (143509, 'EC'), (143516, 'EG'),
    (143506, 'SV'), (143518, 'EE'), (143583, 'FJ'), (143447, 'FI'), (143442, 'FR'),
    (143584, 'GM'), (143443, 'DE'), (143573, 'GH'), (143448, 'GR'), (143546, 'GD'),
    (143504, 'GT'), (143585, 'GW'), (143553, 'GY'), (143510, 'HN'), (143463, 'HK'),
    (143482, 'HU'), (143558, 'IS'), (143467, 'IN'), (143476, 'ID'), (143449, 'IE'),
    (143491, 'IL'), (143450, 'IT'), (143511, 'JM'), (143462, 'JP'), (143528, 'JO'),
    (143517, 'KZ'), (143529, 'KE'), (143493, 'KW'), (143586, 'KG'), (143587, 'LA'),
    (143519, 'LV'), (143497, 'LB'), (143588, 'LR'), (143520, 'LT'), (143451, 'LU'),
    (143515, 'MO'), (143530, 'MK'), (143531, 'MG'), (143589, 'MW'), (143473, 'MY'),
    (143532, 'ML'), (143521, 'MT'), (143590, 'MR'), (143533, 'MU'), (143468, 'MX'),
    (143591, 'FM'), (143523, 'MD'), (143592, 'MN'), (143547, 'MS'), (143593, 'MZ'),
    (143594, 'NA'), (143484, 'NP'), (143452, 'NL'), (143461, 'NZ'), (143512, 'NI'),
    (143534, 'NE'), (143561, 'NG'), (143457, 'NO'), (143562, 'OM'), (143477, 'PK'),
    (143595, 'PW'), (143485, 'PA'), (143597, 'PG'), (143513, 'PY'), (143507, 'PE'),
    (143474, 'PH'), (143478, 'PL'), (143453, 'PT'), (143498, 'QA'), (143487, 'RO'),
    (143469, 'RU'), (143598, 'ST'), (143479, 'SA'), (143535, 'SN'), (143599, 'SC'),
    (143600, 'SL'), (143464, 'SG'), (143496, 'SK'), (143499, 'SI'), (143601, 'SB'),
    (143472, 'ZA'), (143466, 'KR'), (143454, 'ES'), (143486, 'LK'), (143548, 'KN'),
    (143549, 'LC'), (143550, 'VC'), (143554, 'SR'), (143602, 'SZ'), (143456, 'SE'),
    (143459, 'CH'), (143470, 'TW'), (143603, 'TJ'), (143572, 'TZ'), (143475, 'TH'),
    (143551, 'TT'), (143536, 'TN'), (143480, 'TR'), (143604, 'TM'), (143552, 'TC'),
    (143537, 'UG'), (143492, 'UA'), (143481, 'AE'), (143444, 'GB'), (143441, 'US'),
    (143514, 'UY'), (143566, 'UZ'), (143502, 'VE'), (143471, 'VN'), (143571, 'YE'),
    (143605, 'ZW'),
]
# Google Play store table as (store_id, country_code) pairs.
# country_code_to_id() inverts it to look up the store id from a country code;
# store id 1000 stands for the 'WW' pseudo-country.
ANDROID_STORE_COUNTRY_MAPPING = [
    (17, 'AR'), (1, 'AU'), (35, 'AT'), (61, 'AZ'), (11, 'BE'), (18, 'BR'), (47, 'BG'),
    (2, 'CA'), (13, 'CL'), (3, 'CN'), (52, 'CO'), (64, 'CR'), (80, 'HR'), (36, 'CZ'),
    (38, 'DK'), (62, 'EC'), (33, 'EG'), (20, 'FI'), (6, 'FR'), (4, 'DE'), (46, 'GR'),
    (16, 'HK'), (37, 'HU'), (19, 'IN'), (21, 'ID'), (39, 'IE'), (40, 'IL'), (8, 'IT'),
    (9, 'JP'), (53, 'KZ'), (95, 'KE'), (50, 'KW'), (86, 'LV'), (65, 'LB'), (78, 'LT'),
    (24, 'MY'), (26, 'MX'), (23, 'NL'), (41, 'NZ'), (74, 'NG'), (42, 'NO'), (54, 'PK'),
    (56, 'PE'), (31, 'PH'), (28, 'PL'), (43, 'PT'), (84, 'PR'), (73, 'QA'), (44, 'RO'),
    (22, 'RU'), (51, 'SA'), (32, 'SG'), (45, 'SK'), (14, 'ZA'), (27, 'KR'), (5, 'ES'),
    (34, 'SE'), (12, 'CH'), (30, 'TW'), (29, 'TH'), (25, 'TR'), (48, 'UA'), (49, 'AE'),
    (7, 'GB'), (10, 'US'), (15, 'VN'), (1000, 'WW'),
]
def device_code_to_feed(market_code, device_code, metric_name):
    """Return the numeric feed code for a market / device / metric triple.

    Args:
        market_code: 'apple-store' or 'google-play'.
        device_code: 'ios-phone', 'ios-tablet' or 'android-all'.
        metric_name: 'est_free_app_download', 'est_paid_app_download'
            or 'est_revenue'.

    Returns:
        The integer feed code listed for that combination.

    Raises:
        IndexError: if the combination is not in the table.
    """
    # Rows are (market, device, metric, feed_code).
    # NOTE(review): the tablet free/paid codes (101/100) run in the opposite
    # order to the phone ones (0/1); values are reproduced as found.
    feed_table = (
        ('apple-store', 'ios-phone', 'est_free_app_download', 0),
        ('apple-store', 'ios-phone', 'est_paid_app_download', 1),
        ('apple-store', 'ios-phone', 'est_revenue', 2),
        ('apple-store', 'ios-tablet', 'est_free_app_download', 101),
        ('apple-store', 'ios-tablet', 'est_paid_app_download', 100),
        ('apple-store', 'ios-tablet', 'est_revenue', 102),
        ('google-play', 'android-all', 'est_free_app_download', 0),
        ('google-play', 'android-all', 'est_paid_app_download', 1),
        ('google-play', 'android-all', 'est_revenue', 2),
    )
    wanted = (market_code, device_code, metric_name)
    hits = [
        feed
        for market, device, metric, feed in feed_table
        if (market, device, metric) == wanted
    ]
    # Indexing an empty list keeps the IndexError for unknown combinations.
    return hits[0]
def country_code_to_id(market_code, code):
    """Return the store id for a country code, selecting the table by market.

    NOTE: a second ``country_code_to_id`` defined further down in this cell
    (keyed on device_code instead of market_code) rebinds this name, so this
    variant is not the one in effect once the whole cell has run.

    Args:
        market_code: 'apple-store' selects the iOS table; any other value
            selects the Android table.
        code: country code such as 'US' or 'WW'.

    Returns:
        The store id paired with ``code`` in the selected table.

    Raises:
        KeyError: if ``code`` is not present in the selected table.
    """
    if market_code == 'apple-store':
        pairs = IOS_STORE_COUNTRY_MAPPING
    else:
        pairs = ANDROID_STORE_COUNTRY_MAPPING
    # The tables are (store_id, country_code); flip them for the lookup.
    store_id_by_country = dict((iso, store_id) for store_id, iso in pairs)
    return store_id_by_country[code]
def category_to_legacy_category(market_code, legacy):
    """Return the legacy category id for a category id.

    Args:
        market_code: 'apple-store' selects IOS_CATEGORIES; any other value
            selects ANDROID_CATEGORIES.
        legacy: despite the parameter name, this is the category id to
            translate (the second element of each table pair), e.g. 100000.

    Returns:
        The legacy category id (first element of the matching pair).

    Raises:
        KeyError: if the category id is not present in the selected table.
    """
    pairs = IOS_CATEGORIES if market_code == 'apple-store' else ANDROID_CATEGORIES
    # The tables are (legacy_category_id, category_id); flip them for the lookup.
    legacy_by_category = dict((category, legacy_id) for legacy_id, category in pairs)
    return legacy_by_category[legacy]
        
        
def country_code_to_id(device_code, country_code):
    """Return the store id for a country code, selecting the table by device.

    Args:
        device_code: device identifier; any value containing the substring
            'ios' selects the iOS table, everything else the Android table.
        country_code: country code such as 'US' or 'WW'.

    Returns:
        The store id paired with ``country_code`` in the selected table.

    Raises:
        KeyError: if ``country_code`` is not present in the selected table.
    """
    pairs = IOS_STORE_COUNTRY_MAPPING if 'ios' in device_code else ANDROID_STORE_COUNTRY_MAPPING
    # The tables are (store_id, country_code); flip them for the lookup.
    store_id_by_country = dict((iso, store_id) for store_id, iso in pairs)
    return store_id_by_country[country_code]
        
        
def device_code_to_platform(device_code):
    """Map a device code to its platform label.

    Returns 'ios' when ``device_code`` contains the substring 'ios',
    otherwise 'android-all'.
    """
    return 'ios' if 'ios' in device_code else 'android-all'
# 13028, 100061
# 71, 400034
# category_to_legacy_category("apple-store",100000)



In [0]:

from pyspark.sql import functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
# Compare the weekly unified iPad estimates (date=2020-02-08) against the raw
# iOS store estimates for the same week.

# print spark.read.parquet("s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/date=2020-02-08/").show(2)
# print spark.read.parquet("s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=MONTH/date=2020-02-29/").filter("free is not null").filter("id = '20600000303226' and platform='android' and store_id=4 and category='11'")
# "app_id='20600012619922' and category_id='400025' and country_code='AR'
# print spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=weekly/date=2020-02-08/").show(2)
# def country_code_to_id(market_code, code):

# UDF wrappers around the module-level helpers; both are declared as
# returning strings.
to_country_code = udf(lambda x, y : country_code_to_id(x,y), StringType())
to_platform = udf(lambda x : device_code_to_platform(x), StringType())
df1 = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=weekly/date=2020-02-08/").cache()
df2 = spark.read.parquet("s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=WEEK/date=2020-02-08/platform=ios").cache()
# df1.filter("country_code = 'US'").show()
# df2.filter("store_id={}".format(country_code_to_id("ios","US"))).show()

# df2 = df2.select("id","store_id","free", "paid","revenue","platform").cache()
# df1.show(2)

# Reshape the unified iPad rows to the raw column layout
# (id, store_id, ipad_free, ipad_paid, ipad_revenue).
df_filter = df1.filter("device_code='ios-tablet'").select(
    F.col("app_id").alias("id"),
    to_country_code("device_code", "country_code").alias("store_id"),
    F.col("free_app_download").alias("ipad_free"),
    F.col("paid_app_download").alias("ipad_paid"),
    F.col("revenue").alias("ipad_revenue"),
).cache()
# df_filter.show()
# df2.select("id","store_id","iphone_free","iphone_paid","iphone_revenue").show()
# df1.select("device_code", to_platform("device_code").alias("platform")).show()
df_filter.createOrReplaceTempView("unified")
df2.select("id","store_id","ipad_free","ipad_paid","ipad_revenue").filter("store_id not in (3,4,5,6,1003,1004,1005,1006) and not (ipad_free==0 and ipad_paid==0 and ipad_revenue==0 )").distinct().createOrReplaceTempView("raw")

# DataFrame.show() prints the table itself and returns None, so wrapping it
# in `print` only added a stray "None" line to the output.
spark.sql("select * from unified except all select distinct * from raw ").show(2)
spark.sql("select distinct * from raw except all select distinct * from unified").show(2)

# Side-by-side view of every discrepancy: rows present on only one side, or
# present on both with at least one differing metric.
# The earlier predicate was `A is null or B is null and (x != y and ...)`.
# AND binds tighter than OR, and whenever one side of the outer join is
# missing the `!=` comparisons evaluate to NULL, so only raw-only rows were
# ever returned. `<=>` is the null-safe equality operator, so NULL vs 0
# differences are reported as mismatches too.
spark.sql(
    "select * from unified full outer join raw "
    "on unified.id=raw.id and unified.store_id=raw.store_id "
    "where unified.id is null or raw.id is null "
    "or not (unified.ipad_free <=> raw.ipad_free) "
    "or not (unified.ipad_paid <=> raw.ipad_paid) "
    "or not (unified.ipad_revenue <=> raw.ipad_revenue)"
).show(6)


# +---------+--------+-----------+-----------+--------------+
# |       id|store_id|iphone_free|iphone_paid|iphone_revenue|
# +---------+--------+-----------+-----------+--------------+
# |944638307|  143456|          7|       null|            22|
# |562191381|  143456|          8|       null|            48|
# +---------+--------+-----------+-----------+--------------+
# only showing top 2 rows

# None
# +----------+--------+-----------+-----------+--------------+
# |        id|store_id|iphone_free|iphone_paid|iphone_revenue|
# +----------+--------+-----------+-----------+--------------+
# | 672309140|  143454|          4|          0|            13|
# |1101290388|  143470|          0|          0|            26|
# +----------+--------+-----------+-----------+--------------+


In [0]:

# Spot-check a single app (953929401) in one country: 'MZ' in the unified
# data corresponds to iOS store id 143593 in the reshaped/raw views.
df1.filter("app_id='953929401' and country_code='MZ'").show()
df_filter.filter("store_id='143593' and id='953929401'").show()

# DataFrame.show() prints the rows itself and returns None, so the former
# `print` wrapper only emitted an extra "None" line.
spark.sql("select * from unified where store_id='143593' and id='953929401'  ").show(2)
spark.sql("select * from raw where store_id='143593' and id='953929401'").show(2)


In [0]:

from pyspark.sql import functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf


# Compare the weekly unified iPad estimates (date=2020-02-08) against the raw
# iOS store estimates for the same week.

# UDF wrappers around the module-level helpers; both are declared as
# returning strings.
to_country_code = udf(lambda x, y : country_code_to_id(x,y), StringType())
to_platform = udf(lambda x : device_code_to_platform(x), StringType())
df1 = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=weekly/date=2020-02-08/").cache()
df2 = spark.read.parquet("s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=WEEK/date=2020-02-08/platform=ios").cache()

# Reshape the unified iPad rows to the raw column layout
# (id, store_id, ipad_free, ipad_paid, ipad_revenue).
df_filter = df1.filter("device_code='ios-tablet'").select(
    F.col("app_id").alias("id"),
    to_country_code("device_code", "country_code").alias("store_id"),
    F.col("free_app_download").alias("ipad_free"),
    F.col("paid_app_download").alias("ipad_paid"),
    F.col("revenue").alias("ipad_revenue"),
).cache()
df_filter.createOrReplaceTempView("unified")
df2.select("id","store_id","ipad_free","ipad_paid","ipad_revenue").filter("store_id not in (3,4,5,6,1003,1004,1005,1006) and not (ipad_free==0 and ipad_paid==0 and ipad_revenue==0 )").distinct().createOrReplaceTempView("raw")

# DataFrame.show() prints the table itself and returns None, so wrapping it
# in `print` only added a stray "None" line to the output.
spark.sql("select * from unified except all select distinct * from raw ").show(2)
spark.sql("select distinct * from raw except all select distinct * from unified").show(2)

# Side-by-side view of every discrepancy: rows present on only one side, or
# present on both with at least one differing metric.
# The earlier predicate was `A is null or B is null and (x != y and ...)`.
# AND binds tighter than OR, and whenever one side of the outer join is
# missing the `!=` comparisons evaluate to NULL, so only raw-only rows were
# ever returned. `<=>` is the null-safe equality operator, so NULL vs 0
# differences are reported as mismatches too.
spark.sql(
    "select * from unified full outer join raw "
    "on unified.id=raw.id and unified.store_id=raw.store_id "
    "where unified.id is null or raw.id is null "
    "or not (unified.ipad_free <=> raw.ipad_free) "
    "or not (unified.ipad_paid <=> raw.ipad_paid) "
    "or not (unified.ipad_revenue <=> raw.ipad_revenue)"
).show(6)


# +---------+--------+-----------+-----------+--------------+
# |       id|store_id|iphone_free|iphone_paid|iphone_revenue|
# +---------+--------+-----------+-----------+--------------+
# |944638307|  143456|          7|       null|            22|
# |562191381|  143456|          8|       null|            48|
# +---------+--------+-----------+-----------+--------------+
# only showing top 2 rows

# None
# +----------+--------+-----------+-----------+--------------+
# |        id|store_id|iphone_free|iphone_paid|iphone_revenue|
# +----------+--------+-----------+-----------+--------------+
# | 672309140|  143454|          4|          0|            13|
# |1101290388|  143470|          0|          0|            26|
# +----------+--------+-----------+-----------+--------------+




In [0]:
%%sh

# List the unified store.app-est.v1 fact partitions for the daily
# granularity on 2015-03-01 and the monthly granularity on 2015-03-31.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date=2015-03-01/
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=monthly/date=2015-03-31/



In [0]:


# Load every daily partition matching date=2015-09-* (basePath keeps the
# partition columns) and register it as the "unified_daily" view.
df = (
    spark.read
    .option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/")
    .parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date=2015-09-*/")
    .cache()
)
df.createOrReplaceTempView("unified_daily")

# Sum the daily metrics per (country_code, device_code, app_id). Despite its
# name, df_week covers all daily partitions matched above, not a single week.
df_week = spark.sql("select app_id, country_code, sum(free_app_download) as free_app_download , sum(paid_app_download) as paid_app_download  , sum(revenue) as revenue, device_code from unified_daily group by country_code,device_code,app_id ")


In [0]:

# Load the monthly partitions matching date=2015-09-* and expose them, along
# with the summed daily data (df_week), as SQL views.
df1 = (
    spark.read
    .option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/")
    .parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=monthly/date=2015-09-*/")
    .cache()
)
df1.createOrReplaceTempView("unified_monthly")
df1.show()
df_week.createOrReplaceTempView("unified_sum")

# Rows in the summed daily data that have no identical row in the monthly data ...
unfiied_sum_except = spark.sql("select app_id,country_code,free_app_download,paid_app_download,revenue,device_code from unified_sum except all select app_id,country_code,free_app_download,paid_app_download,revenue,device_code from unified_monthly")
# ... and rows in the monthly data that have no identical row in the summed daily data.
unfied_month_except = spark.sql("select app_id,country_code,free_app_download,paid_app_download,revenue,device_code from  unified_monthly except all select app_id,country_code,free_app_download,paid_app_download,revenue,device_code from unified_sum ")

# cache() returns the DataFrame, so these print each frame's repr while
# marking it for caching.
print(unfiied_sum_except.cache())
print(unfied_month_except.cache())

# Which device codes appear among the differing rows on each side.
unfiied_sum_except.select("device_code").distinct().show()
unfied_month_except.select("device_code").distinct().show()



In [0]:


# Show the differing rows for app 20600001181828 on each side of the
# comparison: summed-daily first, then monthly.
for diff_frame in (unfiied_sum_except, unfied_month_except):
    diff_frame.filter("app_id=20600001181828").show()


In [0]:


# Display the summed daily rows for the 'ios-phone' device code.
(
    spark
    .sql("select * from unified_sum where device_code='ios-phone'")
    .show()
)


In [0]:
%%sh

# Recursively list every key under the date=2015-08 prefix for the daily,
# weekly and monthly granularities; the echo lines only separate the three
# listings in the output.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date=2015-08 --recursive
echo 'hahaha'
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=weekly/date=2015-08 --recursive
echo 'hahaha'
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=monthly/date=2015-08 --recursive

# aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date=2015-08 --recursive

# aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=weekly/date=2015-08 --recursive


In [0]:

from pyspark.sql import functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

# print spark.read.parquet("s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/date=2020-02-08/").show(2)
# print spark.read.parquet("s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=MONTH/date=2020-02-29/").filter("free is not null").filter("id = '20600000303226' and platform='android' and store_id=4 and category='11'")
# "app_id='20600012619922' and category_id='400025' and country_code='AR'
# print spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=weekly/date=2020-02-08/").show(2)
# def country_code_to_id(market_code, code):

# Compare the monthly unified iPad estimates (date=2020-02-29) against the
# raw iOS store estimates for the same month.

# UDF wrappers around the module-level helpers; both are declared as
# returning strings.
to_country_code = udf(lambda x, y : country_code_to_id(x,y), StringType())
to_platform = udf(lambda x : device_code_to_platform(x), StringType())
df1 = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=monthly/date=2020-02-29/").cache()
df2 = spark.read.parquet("s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=MONTH/date=2020-02-29/platform=ios").cache()
# df1.filter("country_code = 'US'").show()
# df2.filter("store_id={}".format(country_code_to_id("ios","US"))).show()

# df2 = df2.select("id","store_id","free", "paid","revenue","platform").cache()
# df1.show(2)

# Reshape the unified iPad rows to the raw column layout
# (id, store_id, ipad_free, ipad_paid, ipad_revenue).
df_filter = df1.filter("device_code='ios-tablet'").select(
    F.col("app_id").alias("id"),
    to_country_code("device_code", "country_code").alias("store_id"),
    F.col("free_app_download").alias("ipad_free"),
    F.col("paid_app_download").alias("ipad_paid"),
    F.col("revenue").alias("ipad_revenue"),
).cache()
# df_filter.show()
# df2.select("id","store_id","iphone_free","iphone_paid","iphone_revenue").show()
# df1.select("device_code", to_platform("device_code").alias("platform")).show()
df_filter.createOrReplaceTempView("unified")
df2.select("id","store_id","ipad_free","ipad_paid","ipad_revenue").filter("store_id not in (3,4,5,6,1003,1004,1005,1006) and not (ipad_free==0 and ipad_paid==0 and ipad_revenue==0 )").distinct().createOrReplaceTempView("raw")

# DataFrame.show() prints the table itself and returns None, so wrapping it
# in `print` only added a stray "None" line to the output.
spark.sql("select * from unified except all select distinct * from raw ").show(2)
spark.sql("select distinct * from raw except all select distinct * from unified").show(2)

# Side-by-side view of every discrepancy: rows present on only one side, or
# present on both with at least one differing metric.
# The earlier predicate was `A is null or B is null and (x != y and ...)`.
# AND binds tighter than OR, and whenever one side of the outer join is
# missing the `!=` comparisons evaluate to NULL, so only raw-only rows were
# ever returned. `<=>` is the null-safe equality operator, so NULL vs 0
# differences are reported as mismatches too.
spark.sql(
    "select * from unified full outer join raw "
    "on unified.id=raw.id and unified.store_id=raw.store_id "
    "where unified.id is null or raw.id is null "
    "or not (unified.ipad_free <=> raw.ipad_free) "
    "or not (unified.ipad_paid <=> raw.ipad_paid) "
    "or not (unified.ipad_revenue <=> raw.ipad_revenue)"
).show(6)


# +---------+--------+-----------+-----------+--------------+
# |       id|store_id|iphone_free|iphone_paid|iphone_revenue|
# +---------+--------+-----------+-----------+--------------+
# |944638307|  143456|          7|       null|            22|
# |562191381|  143456|          8|       null|            48|
# +---------+--------+-----------+-----------+--------------+
# only showing top 2 rows

# None
# +----------+--------+-----------+-----------+--------------+
# |        id|store_id|iphone_free|iphone_paid|iphone_revenue|
# +----------+--------+-----------+-----------+--------------+
# | 672309140|  143454|          4|          0|            13|
# |1101290388|  143470|          0|          0|            26|
# +----------+--------+-----------+-----------+--------------+
