In [0]:

# Register the bundled dependency archive with the SparkContext; addPyFile
# makes a .py/.zip dependency available to tasks run on this context afterwards.
# NOTE(review): `spark` is not defined in this notebook - presumably injected
# by the notebook kernel; confirm.
spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import pandas as pd
# Keep wide DataFrame reprs on a single line instead of wrapping them.
pd.set_option('expand_frame_repr', False)


In [0]:


import pandas as pd
from pyspark.sql import functions as F


# Product -> genre mapping dimension, read once and shared by every compare() call.
mapping_df_unified = spark.read.parquet(
    "s3://b2c-prod-data-pipeline-unified-dna/unified/dna.genre_id_product_mapping.v1/dimension/"
)

# explode() turns the genre_id collection into one row per (product_id, genre_id) pair.
transformed_mapping_df = (
    mapping_df_unified
    .select("product_id", "genre_id")
    .withColumn("genre_id", F.explode("genre_id"))
)

def compare(date):
    """Check the daily genre estimates against a re-aggregation of the app estimates.

    Reads the ``store.app-est.v1`` and ``store.genre-est.v1`` daily partitions
    for ``date``, joins the app-level rows to the module-level
    ``transformed_mapping_df`` on ``product_id``, sums downloads and revenue
    per (device_code, country_code, genre_id) and compares the result with the
    rows read from the genre-level partition.

    Prints ``"<date>: PASS"`` when no differing row is found, otherwise
    ``"<date>: FAIL"`` followed by up to two differing rows.

    Args:
        date: Partition date string substituted into the ``date=`` component
            of both S3 paths (the caller passes ``YYYY-MM-DD``).

    Returns:
        None. The outcome is only printed.

    Raises:
        Nothing is caught here: any exception raised by Spark while reading or
        evaluating the data propagates to the caller.
    """
    group_columns = ["device_code", "country_code", "genre_id"]
    compared_columns = group_columns + ["download", "revenue"]

    # collect
    store_est_unified_df = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={}/".format(date))
    est_unified_df = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.genre-est.v1/fact/granularity=daily/date={}/".format(date))

    # transform: rename app_id so the app-level rows share the mapping's key name
    transformed_store_est_unified_df = (
        store_est_unified_df
        .select(["app_id", "device_code", "country_code", "free_app_download", "paid_app_download", "revenue"])
        .withColumnRenamed("app_id", "product_id")
    )

    # Aggregate app-level figures up to genre level. Aliasing inside agg()
    # avoids the "sum(<col>)" names that would otherwise need renaming.
    giq_df = (
        transformed_store_est_unified_df
        .join(transformed_mapping_df, on="product_id", how="inner")
        .groupBy(group_columns)
        .agg(
            F.sum("free_app_download").alias("free_app_download"),
            F.sum("paid_app_download").alias("paid_app_download"),
            F.sum("revenue").alias("revenue"),
        )
    )

    # Missing download sums count as 0; a non-positive total is stored as null.
    giq_df = giq_df.fillna({"free_app_download": 0, "paid_app_download": 0})
    total_download = F.col("free_app_download") + F.col("paid_app_download")
    giq_df = giq_df.withColumn(
        "download",
        F.when(total_download > 0, total_download).otherwise(F.lit(None)),
    )

    # compare: rows present in exactly one of the two sides.
    # NOTE: subtract()/intersect() work on distinct rows, so a row that is
    # merely duplicated on one side is not reported as a difference.
    s1 = giq_df.select(compared_columns)
    s2 = est_unified_df.select(compared_columns)
    diff = s1.union(s2).subtract(s1.intersect(s2))

    if diff.count() > 0:
        print("{}: FAIL".format(date))
        # show() prints the rows itself and returns None; wrapping it in
        # print would emit a stray "None" line after the table.
        diff.show(2)
    else:
        print("{}: PASS".format(date))
    
    
def get_date_list(start_date, end_date, freq="D"):
    """Return the dates from ``start_date`` to ``end_date`` as ``YYYY-MM-DD`` strings.

    Args:
        start_date: First date of the range, in any form ``pd.date_range`` accepts.
        end_date: Last date of the range, in any form ``pd.date_range`` accepts.
        freq: pandas frequency string controlling the step (default ``"D"``).

    Returns:
        A list of formatted date strings, one per generated timestamp.
    """
    formatted_dates = []
    for timestamp in pd.date_range(start=start_date, end=end_date, freq=freq):
        formatted_dates.append(timestamp.strftime('%Y-%m-%d'))
    return formatted_dates


# Alternative, wider range (disabled):
# date_list = get_date_list("2010-07-04", "2020-02-08")
# The trailing "/" previously appended to these dates is not part of a date and
# only worked through lenient parsing, so plain YYYY-MM-DD strings are used.
date_list = get_date_list("2019-01-01", "2020-02-08")
for date in date_list:
    try:
        compare(date)
    except Exception as e:
        # Best effort: one failing date must not stop the remaining dates, but
        # the cause is reported so an ERROR line can be investigated.
        print("{}: ERROR ({!r})".format(date, e))



In [0]:
%%sh

# Manual check: list the objects stored under the app-est daily partition for
# date=2012-07-04, device_code=android-all.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date=2012-07-04/device_code=android-all/


In [0]:
%%sh
