In [0]:


import pandas as pd
from pyspark.sql import functions as F
from datetime import datetime
from applications.db_check_v1.common.db_check_utils import query
from conf.settings import PG_USAGE_HOSTS, PG_USAGE_NAME, PG_USAGE_ACCESS_ID, PG_USAGE_SECRET_KEY, \
    CITUS_USAGE_NAME, CITUS_USAGE_ACCESS_ID, CITUS_USAGE_HOSTS, CITUS_USAGE_SECRET_KEY, PG_AA_HOSTS, PG_AA_NAME,PG_AA_ACCESS_ID,PG_AA_SECRET_KEY

# Connection string assembled from the PG_USAGE_* settings; host and port come
# from the first entry of PG_USAGE_HOSTS.
PLPROXY_DSN = "dbname='{db}' user='{user}' password='{password}' host='{host}' port='{port}'".format(
    db=PG_USAGE_NAME,
    user=PG_USAGE_ACCESS_ID,
    password=PG_USAGE_SECRET_KEY,
    host=PG_USAGE_HOSTS[0][0],
    port=PG_USAGE_HOSTS[0][1],
)

# Connection string assembled from the PG_AA_* settings; host and port come
# from the first entry of PG_AA_HOSTS.
PG_AA_DSN = "dbname='{db}' user='{user}' password='{password}' host='{host}' port='{port}'".format(
    db=PG_AA_NAME,
    user=PG_AA_ACCESS_ID,
    password=PG_AA_SECRET_KEY,
    host=PG_AA_HOSTS[0][0],
    port=PG_AA_HOSTS[0][1],
)

def get_date_list(begin_date, end_date, freq):
    """Return the dates between two bounds as ``YYYY-MM-DD`` strings.

    Args:
        begin_date: Start of the range, passed to ``pandas.date_range`` as ``start``.
        end_date: End of the range, passed to ``pandas.date_range`` as ``end``.
        freq: pandas frequency alias (e.g. ``"D"``, ``"W-SAT"``).

    Returns:
        list of str: One formatted date per generated timestamp, in ascending order.
    """
    stamps = pd.date_range(start=begin_date, end=end_date, freq=freq)
    return [stamp.strftime('%Y-%m-%d') for stamp in stamps]


# Inclusive date window covered by every check in this notebook.
begin_date = datetime(2019, 1, 1)
end_date = datetime(2020, 9, 30)

# Granularity name -> list of 'YYYY-MM-DD' dates, newest first.
# Weekly dates fall on Saturdays ("W-SAT"); monthly dates are month ends ("M").
DATE_GRANULARITY_MAPPINGLIST = {
    name: get_date_list(begin_date, end_date, freq)[::-1]
    for name, freq in (("daily", "D"), ("weekly", "W-SAT"), ("monthly", "M"))
}

In [0]:

# Granularities to validate, in processing order.
# Narrow this list (e.g. to ["monthly"]) for a quick spot check.
granularity_list = ["monthly", "weekly", "daily"]

# Granularity name -> value of the range_type partition in app_basic_raw_path.
granularity_mapping = {
    "monthly": "MONTH",
    "weekly": "WEEK",
    "daily": "DAY",
}

# Transformed (v6) app table, read as Delta and used as the "after" side.
usage_basic_after_transform = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v6/fact/product_type_code=app/"
# Earlier (v1) basic-KPI table, read as Delta and used as the "before" side of the sum check.
usage_basic_before_transform = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v1/fact/"
# Domain "before" data; placeholders: granularity (first letter), month (YYYYMM), date.
domain_before_transform = "s3://b2c-prod-data-pipeline-unified-mobileweb-paid/unified/mobileweb.basic.v4/fact/granularity={granularity}/month={month}/date={date}/"
# Raw app basic-KPI parquet data; placeholders: range_type (see granularity_mapping), date.
app_basic_raw_path = "s3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=1.0.0/range_type={range_type}/date={date}/"

def compare_count():
    """Check app basic-KPI row counts: raw parquet feed vs. transformed Delta table.

    For each granularity in ``granularity_list`` and each date in
    ``DATE_GRANULARITY_MAPPINGLIST[granularity]``, the raw rows with non-null,
    non-zero AU and UP are registered as temp view ``before_trans`` and the
    transformed rows for that date/granularity as ``after_trans``.  One line
    is printed per date: ``PASS`` when the two counts are equal, otherwise
    both counts.
    """
    raw_filter = "AU is not null and AU <> 0 and UP is not null and UP <> 0 "
    for granularity in granularity_list:
        for date in DATE_GRANULARITY_MAPPINGLIST[granularity]:
            after_filter = "date = '{date}' and granularity_code = '{granularity}'".format(date=date, granularity=granularity)

            raw_path = app_basic_raw_path.format(range_type=granularity_mapping[granularity], date=date)
            raw_rows = spark.read.format("parquet").load(raw_path).filter(raw_filter)
            raw_rows.createOrReplaceTempView("before_trans")

            transformed_rows = spark.read.format("delta").load(usage_basic_after_transform).filter(after_filter)
            transformed_rows.createOrReplaceTempView("after_trans")

            before_count = spark.sql("select count(1) as before_trans from before_trans ").collect()[0][0]
            after_count = spark.sql("select count(1) as after_trans from after_trans").collect()[0][0]

            if before_count == after_count:
                report = (granularity, date, "PASS")
            else:
                report = (granularity, date, "before", before_count, "after", after_count)
            # A single pre-joined string keeps the output space-separated
            # under both Python 2 and Python 3.
            print(" ".join(str(part) for part in report))

def compare_sum_value():
    """Check summed active users: basic-kpi v1 table vs. transformed v6 app table.

    For each granularity in ``granularity_list`` and each date in
    ``DATE_GRANULARITY_MAPPINGLIST[granularity]``, registers the filtered v1
    rows as temp view ``before_trans`` and the filtered v6 rows as
    ``after_trans``, then compares ``sum(est_average_active_users)`` with
    ``sum(est_active_users)``.  Prints ``PASS`` when the relative difference
    is at most 1e-6, otherwise both sums.

    A date with no rows (``SUM`` is NULL) or a zero baseline is reported
    instead of aborting the whole sweep with TypeError/ZeroDivisionError.
    """
    tolerance = 0.000001
    for granularity in granularity_list:
        for date in DATE_GRANULARITY_MAPPINGLIST[granularity]:
            filter_str = "date ='{date}' and granularity = '{granularity}' and est_average_active_users is not null and est_average_active_users <> 0 and est_usage_penetration is not null and est_usage_penetration <> 0 ".format(date=date, granularity=granularity)
            filter_str_after = "date = '{date}' and granularity_code = '{granularity}'".format(date=date, granularity=granularity)
            spark.read.format("delta").load(usage_basic_before_transform).filter(filter_str).createOrReplaceTempView("before_trans")
            spark.read.format("delta").load(usage_basic_after_transform).filter(filter_str_after).createOrReplaceTempView("after_trans")

            before_sum = spark.sql("select sum(est_average_active_users) as before_trans from before_trans ").collect()[0][0]
            after_sum = spark.sql("select sum(est_active_users) as after_trans from after_trans ").collect()[0][0]

            if before_sum is None or after_sum is None or before_sum == 0:
                # No relative difference can be formed; only identical values pass.
                sums_match = before_sum == after_sum
            else:
                # float() forces true division: on Python 2, "/" between
                # integral sums would floor the ratio to 0 and always pass.
                relative_diff = abs(float(before_sum) - float(after_sum)) / abs(float(before_sum))
                sums_match = relative_diff <= tolerance

            if sums_match:
                report = (granularity, date, "PASS")
            else:
                report = (granularity, date, "before", before_sum, "after", after_sum)
            # A single pre-joined string prints identically on Python 2 and 3.
            print(" ".join(str(part) for part in report))

def compare_domain_count():
    """Check domain row counts: mobileweb source data vs. transformed v6 domain table.

    For each granularity in ``granularity_list`` and each date in
    ``DATE_GRANULARITY_MAPPINGLIST[granularity]``, registers the filtered
    source rows as temp view ``before_trans`` and the transformed rows for
    that date/granularity as ``after_trans``.  Prints one line per date with
    ``PASS`` or ``Failed`` followed by both counts.
    """
    domain_after_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v6/fact/product_type_code=domain/"
    before_filter = "est_average_active_users is not null and est_average_active_users <> 0 and est_usage_penetration is not null and est_usage_penetration <> 0 "
    for granularity in granularity_list:
        for date in DATE_GRANULARITY_MAPPINGLIST[granularity]:
            after_filter = "date = '{date}' and granularity_code = '{granularity}'".format(date=date, granularity=granularity)

            # The source path takes the month as YYYYMM and the granularity
            # as its first letter.
            before_path = domain_before_transform.format(month=date[:-3].replace("-", ""), granularity=granularity[0], date=date)
            before_rows = spark.read.format("delta").load(before_path).filter(before_filter)
            before_rows.createOrReplaceTempView("before_trans")

            after_rows = spark.read.format("delta").load(domain_after_path).filter(after_filter)
            after_rows.createOrReplaceTempView("after_trans")

            before_count = spark.sql("select count(1) as before_trans from before_trans ").collect()[0][0]
            after_count = spark.sql("select count(1) as after_trans from after_trans").collect()[0][0]

            status = "PASS" if before_count == after_count else "Failed"
            report = (granularity, date, status, before_count, after_count)
            # A single pre-joined string keeps the output space-separated
            # under both Python 2 and Python 3.
            print(" ".join(str(part) for part in report))


def compare_domain_sum_value():
    """Check summed active users: mobileweb source data vs. transformed v6 domain table.

    For each granularity in ``granularity_list`` and each date in
    ``DATE_GRANULARITY_MAPPINGLIST[granularity]``, registers the filtered
    source rows as temp view ``before_trans`` and the transformed rows as
    ``after_trans``, then compares ``sum(est_average_active_users)`` with
    ``sum(est_active_users)``.  Prints ``PASS`` (relative difference at most
    1e-6) or ``Failed``, followed by both sums.

    A date with no rows (``SUM`` is NULL) or a zero baseline is reported
    instead of aborting the whole sweep with TypeError/ZeroDivisionError.
    """
    tolerance = 0.000001
    usage_basic_after_transform = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v6/fact/product_type_code=domain/"
    filter_str = "est_average_active_users is not null and est_average_active_users <> 0 and est_usage_penetration is not null and est_usage_penetration <> 0 "
    for granularity in granularity_list:
        for date in DATE_GRANULARITY_MAPPINGLIST[granularity]:
            filter_str_after = "date = '{date}' and granularity_code = '{granularity}'".format(date=date, granularity=granularity)
            # The source path takes the month as YYYYMM and the granularity
            # as its first letter.
            before_path = domain_before_transform.format(month=date[:-3].replace("-", ""), granularity=granularity[0], date=date)
            spark.read.format("delta").load(before_path).filter(filter_str).createOrReplaceTempView("before_trans")
            spark.read.format("delta").load(usage_basic_after_transform).filter(filter_str_after).createOrReplaceTempView("after_trans")

            before_sum = spark.sql("select sum(est_average_active_users) as before_trans from before_trans").collect()[0][0]
            after_sum = spark.sql("select sum(est_active_users) as after_trans from after_trans").collect()[0][0]

            if before_sum is None or after_sum is None or before_sum == 0:
                # No relative difference can be formed; only identical values pass.
                sums_match = before_sum == after_sum
            else:
                # float() forces true division: on Python 2, "/" between
                # integral sums would floor the ratio to 0 and always pass.
                relative_diff = abs(float(before_sum) - float(after_sum)) / abs(float(before_sum))
                sums_match = relative_diff <= tolerance

            status = "PASS" if sums_match else "Failed"
            report = (granularity, date, status, before_sum, after_sum)
            # A single pre-joined string prints identically on Python 2 and 3.
            print(" ".join(str(part) for part in report))

compare_domain_sum_value()




In [0]:

# Legacy age/gender input, read as parquet; placeholders: granularity, month (YYYY-MM).
age_gender_before_transform = "s3://b2c-prod-data-pipeline-unified-usage/unified/app-tech.usage.legacy-ag_app.v2/fact/granularity={granularity}/month={date}/"
# Transformed age/gender output, read as Delta; placeholders: granularity, date (YYYY-MM-DD).
age_gender_after_transform = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.seg-by-age-gender.v6/fact/granularity_code={granularity}/date={date}/"

# Granularities covered by the age/gender checks (rebinds the notebook-wide list).
granularity_list = ["monthly", "weekly"]

def compare_count():
    """Check age/gender row counts: legacy parquet input vs. transformed Delta output.

    For each granularity in ``granularity_list`` and each date in
    ``DATE_GRANULARITY_MAPPINGLIST[granularity]``, registers the legacy rows
    with ``kpi=1`` and a non-null, non-zero ``estimate`` for that date as temp
    view ``before_trans`` and the transformed partition for that date as
    ``after_trans``.  Prints ``PASS`` when the counts are equal, otherwise
    both counts.
    """
    for granularity in granularity_list:
        for date in DATE_GRANULARITY_MAPPINGLIST[granularity]:
            before_filter = "kpi=1 and estimate is not null and estimate <> 0 and date='{date}'".format(date=date)

            # The legacy path takes a YYYY-MM month; the date itself is
            # applied through the filter.
            before_path = age_gender_before_transform.format(granularity=granularity, date=date[:-3])
            before_rows = spark.read.format("parquet").load(before_path).filter(before_filter)
            before_rows.createOrReplaceTempView("before_trans")

            after_path = age_gender_after_transform.format(granularity=granularity, date=date)
            spark.read.format("delta").load(after_path).createOrReplaceTempView("after_trans")

            before_count = spark.sql("select count(1) as before_trans from before_trans ").collect()[0][0]
            after_count = spark.sql("select count(1) as after_trans from after_trans").collect()[0][0]

            if before_count == after_count:
                report = (granularity, date, "PASS")
            else:
                report = (granularity, date, "before", before_count, "after", after_count)
            # A single pre-joined string keeps the output space-separated
            # under both Python 2 and Python 3.
            print(" ".join(str(part) for part in report))
                
def compare_sum_value():
    """Check age/gender totals: legacy ``sum(estimate)`` vs. transformed ``sum(est_active_users)``.

    For each granularity in ``granularity_list`` and each date in
    ``DATE_GRANULARITY_MAPPINGLIST[granularity]``, registers the filtered
    legacy rows as temp view ``before_trans`` and the transformed partition as
    ``after_trans``.  Prints ``PASS`` when the relative difference of the two
    sums is at most 1e-6, otherwise both sums.

    A date with no rows (``SUM`` is NULL) or a zero baseline is reported
    instead of aborting the whole sweep with TypeError/ZeroDivisionError.
    """
    tolerance = 0.000001
    for granularity in granularity_list:
        for date in DATE_GRANULARITY_MAPPINGLIST[granularity]:
            filter_before_str = "kpi=1 and estimate is not null and estimate <> 0 and date='{date}'".format(date=date)
            spark.read.format("parquet").load(age_gender_before_transform.format(granularity=granularity, date=date[:-3])).filter(filter_before_str).createOrReplaceTempView("before_trans")
            spark.read.format("delta").load(age_gender_after_transform.format(granularity=granularity, date=date)).createOrReplaceTempView("after_trans")

            before_sum = spark.sql("select sum(estimate) as before_trans from before_trans ").collect()[0][0]
            after_sum = spark.sql("select sum(est_active_users) as after_trans from after_trans ").collect()[0][0]

            if before_sum is None or after_sum is None or before_sum == 0:
                # No relative difference can be formed; only identical values pass.
                sums_match = before_sum == after_sum
            else:
                # float() forces true division: on Python 2, "/" between
                # integral sums would floor the ratio to 0 and always pass.
                relative_diff = abs(float(before_sum) - float(after_sum)) / abs(float(before_sum))
                sums_match = relative_diff <= tolerance

            if sums_match:
                report = (granularity, date, "PASS")
            else:
                report = (granularity, date, "before", before_sum, "after", after_sum)
            # A single pre-joined string prints identically on Python 2 and 3.
            print(" ".join(str(part) for part in report))

compare_sum_value()

In [0]:

# Legacy per-product input, read as parquet; placeholders: granularity, month (YYYY-MM).
seg_by_product_before_transform = "s3://b2c-prod-data-pipeline-unified-usage/unified/app-tech.usage.legacy-au_app.v2/fact/granularity={granularity}/month={date}/"
# Transformed seg-by-product output, read as Delta; placeholders: granularity, date (YYYY-MM-DD).
seg_by_product_after_transform = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.seg-by-product.v6/fact/granularity_code={granularity}/date={date}/"

# Granularities covered by the seg-by-product check (rebinds the notebook-wide list).
granularity_list = ["monthly", "weekly"]

def compare():
    """Check seg-by-product counts and totals: legacy parquet vs. transformed Delta.

    For each granularity in ``granularity_list`` and each date in
    ``DATE_GRANULARITY_MAPPINGLIST[granularity]``, registers the legacy rows
    with ``kpi=9`` and a non-null, non-zero ``estimate`` as temp view
    ``before_trans`` and the transformed partition as ``after_trans``.  Prints
    ``PASS`` when the row counts are equal and ``sum(estimate)`` is within a
    relative 1e-6 of ``sum(est_usage_penetration)``; otherwise prints both
    counts and both sums.

    A date with no rows (``SUM`` is NULL) or a zero baseline is reported
    instead of aborting the whole sweep with TypeError/ZeroDivisionError.
    """
    tolerance = 0.000001
    for granularity in granularity_list:
        for date in DATE_GRANULARITY_MAPPINGLIST[granularity]:
            filter_before_str = "kpi=9 and estimate is not null and estimate <> 0 and date='{date}'".format(date=date)
            spark.read.format("parquet").load(seg_by_product_before_transform.format(granularity=granularity, date=date[:-3])).filter(filter_before_str).createOrReplaceTempView("before_trans")
            spark.read.format("delta").load(seg_by_product_after_transform.format(granularity=granularity, date=date)).createOrReplaceTempView("after_trans")

            before_row = spark.sql("select count(1) as before_trans_count, sum(estimate) as sum from before_trans ").collect()[0]
            after_row = spark.sql("select count(1) as after_trans_count, sum(est_usage_penetration) as sum from after_trans").collect()[0]
            before_count, before_sum = before_row[0], before_row[1]
            after_count, after_sum = after_row[0], after_row[1]

            if before_sum is None or after_sum is None or before_sum == 0:
                # No relative difference can be formed; only identical values pass.
                sums_match = before_sum == after_sum
            else:
                # float() forces true division: on Python 2, "/" between
                # integral sums would floor the ratio to 0 and always pass.
                relative_diff = abs(float(before_sum) - float(after_sum)) / abs(float(before_sum))
                sums_match = relative_diff < tolerance

            if sums_match and before_count == after_count:
                report = (granularity, date, "PASS")
            else:
                report = (granularity, date, "before_count", before_count, "after_count", after_count, "before_sum", before_sum, "after_sum", after_sum)
            # A single pre-joined string prints identically on Python 2 and 3.
            print(" ".join(str(part) for part in report))

compare()

In [0]:

# Spot check: count the legacy seg-by-product rows for 2019-01-31 (monthly).
# Dedicated names are used so the {granularity}/{date} path template in
# seg_by_product_before_transform is not overwritten; clobbering it would make
# a later re-run of the seg-by-product compare() read month=2019-01 for every date.
seg_by_product_sample_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/app-tech.usage.legacy-au_app.v2/fact/granularity=monthly/month=2019-01/"
seg_by_product_sample_filter = "kpi=9 and estimate is not null and estimate <> 0 and date='2019-01-31'"
spark.read.format("parquet").load(seg_by_product_sample_path).filter(seg_by_product_sample_filter).createOrReplaceTempView("before_trans")
spark.sql("select count(1) from  before_trans").show(1, False)

In [0]:

# Legacy app-cross-app input, read as parquet; placeholders: granularity, month (YYYY-MM).
app_x_app_before_transform = "s3://b2c-prod-data-pipeline-unified-usage/unified/app-tech.usage.legacy-ca_app.v2/fact/granularity={granularity}/month={date}/"
# Transformed cross-product table, read as Delta and filtered by cross_type, granularity_code and date.
cross_after_transform = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.cross-product.v6/fact/"

# Granularities covered by the cross-product check (rebinds the notebook-wide list).
granularity_list = ["monthly"]

def compare():
    """Check app-cross-app counts and totals: legacy parquet vs. transformed Delta.

    For each granularity in ``granularity_list`` and each date in
    ``DATE_GRANULARITY_MAPPINGLIST[granularity]``, registers the legacy rows
    with ``kpi=9`` and a non-null, non-zero ``estimate`` as temp view
    ``before_trans`` and the ``app_cross_app`` rows of the transformed table
    for that date/granularity as ``after_trans``.  Prints ``PASS`` when the
    row counts are equal and ``sum(estimate)`` is within a relative 1e-6 of
    ``sum(est_usage_penetration)``; otherwise prints both counts and both sums.

    A date with no rows (``SUM`` is NULL) or a zero baseline is reported
    instead of aborting the whole sweep with TypeError/ZeroDivisionError.
    """
    tolerance = 0.000001
    for granularity in granularity_list:
        for date in DATE_GRANULARITY_MAPPINGLIST[granularity]:
            filter_before_str = "kpi=9 and estimate is not null and estimate <> 0 and date='{date}'".format(date=date)
            filter_after_str = "cross_type = 'app_cross_app' and granularity_code='{granularity}' and date='{date}'".format(granularity=granularity, date=date)
            spark.read.format("parquet").load(app_x_app_before_transform.format(granularity=granularity, date=date[:-3])).filter(filter_before_str).createOrReplaceTempView("before_trans")
            spark.read.format("delta").load(cross_after_transform).filter(filter_after_str).createOrReplaceTempView("after_trans")

            before_row = spark.sql("select count(1) as before_trans_count, sum(estimate) as sum from before_trans ").collect()[0]
            after_row = spark.sql("select count(1) as after_trans_count, sum(est_usage_penetration) as sum from after_trans").collect()[0]
            before_count, before_sum = before_row[0], before_row[1]
            after_count, after_sum = after_row[0], after_row[1]

            if before_sum is None or after_sum is None or before_sum == 0:
                # No relative difference can be formed; only identical values pass.
                sums_match = before_sum == after_sum
            else:
                # float() forces true division: on Python 2, "/" between
                # integral sums would floor the ratio to 0 and always pass.
                relative_diff = abs(float(before_sum) - float(after_sum)) / abs(float(before_sum))
                sums_match = relative_diff < tolerance

            if sums_match and before_count == after_count:
                report = (granularity, date, "PASS")
            else:
                report = (granularity, date, "before_count", before_count, "after_count", after_count, "before_sum", before_sum, "after_sum", after_sum)
            # A single pre-joined string prints identically on Python 2 and 3.
            print(" ".join(str(part) for part in report))

compare()

In [0]:

# Ad-hoc inspection of the temp views "after_trans" and "before_trans";
# both must already be registered in this Spark session.

# Row count per device_code on the transformed side.
spark.sql(
    "select count(1) from after_trans group by device_code order by device_code"
).show(10, False)

# Distinct store_id count per device_id on the legacy side.
spark.sql(
    "select count(distinct store_id) from before_trans group by device_id order by device_id"
).show(10, False)

In [0]:

# Legacy retention input path template; placeholders: granularity, month.
# NOTE(review): the name is spelled "transfrom"; kept as-is so existing references still resolve.
app_retention_before_transfrom = "s3://b2c-prod-data-pipeline-unified-usage/unified/app-tech.usage.legacy-rt_app.v2/fact/granularity={granularity}/month={date}/"
# NOTE(review): this is the cross-product.v6 location, the same value as
# cross_after_transform - confirm whether a retention table was intended.
retention_after_transform = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.cross-product.v6/fact/"

# Granularities covered by the check in this cell (rebinds the notebook-wide list).
granularity_list = ["monthly"]

def compare():
    """Check counts and totals per date: legacy parquet vs. transformed Delta.

    For each granularity in ``granularity_list`` and each date in
    ``DATE_GRANULARITY_MAPPINGLIST[granularity]``, registers the legacy rows
    with ``kpi=9`` and a non-null, non-zero ``estimate`` as temp view
    ``before_trans`` and the ``app_cross_app`` rows of the transformed table
    for that date/granularity as ``after_trans``.  Prints ``PASS`` when the
    row counts are equal and ``sum(estimate)`` is within a relative 1e-6 of
    ``sum(est_usage_penetration)``; otherwise prints both counts and both sums.

    Changes from the previous version of this function:
      * the "before" filter is now applied (it was built but never used);
      * the "after" view is filtered by the loop's granularity and date; it
        was hard-coded to monthly/2019-01-31 and then replaced by the whole
        unfiltered table, so no date could match;
      * a NULL or zero baseline sum is reported instead of raising.

    NOTE(review): this still reads ``app_x_app_before_transform`` and
    ``cross_after_transform`` (defined in an earlier cell), not this cell's
    ``app_retention_before_transfrom`` / ``retention_after_transform``.
    Confirm the intended sources, filter and columns for a retention check
    before switching them.
    """
    tolerance = 0.000001
    for granularity in granularity_list:
        for date in DATE_GRANULARITY_MAPPINGLIST[granularity]:
            filter_before_str = "kpi=9 and estimate is not null and estimate <> 0 and date='{date}'".format(date=date)
            filter_after_str = "cross_type = 'app_cross_app' and granularity_code='{granularity}' and date='{date}'".format(granularity=granularity, date=date)
            spark.read.format("parquet").load(app_x_app_before_transform.format(granularity=granularity, date=date[:-3])).filter(filter_before_str).createOrReplaceTempView("before_trans")
            spark.read.format("delta").load(cross_after_transform).filter(filter_after_str).createOrReplaceTempView("after_trans")

            before_row = spark.sql("select count(1) as before_trans_count, sum(estimate) as sum from before_trans ").collect()[0]
            after_row = spark.sql("select count(1) as after_trans_count, sum(est_usage_penetration) as sum from after_trans").collect()[0]
            before_count, before_sum = before_row[0], before_row[1]
            after_count, after_sum = after_row[0], after_row[1]

            if before_sum is None or after_sum is None or before_sum == 0:
                # No relative difference can be formed; only identical values pass.
                sums_match = before_sum == after_sum
            else:
                # float() forces true division: on Python 2, "/" between
                # integral sums would floor the ratio to 0 and always pass.
                relative_diff = abs(float(before_sum) - float(after_sum)) / abs(float(before_sum))
                sums_match = relative_diff < tolerance

            if sums_match and before_count == after_count:
                report = (granularity, date, "PASS")
            else:
                report = (granularity, date, "before_count", before_count, "after_count", after_count, "before_sum", before_sum, "after_sum", after_sum)
            # A single pre-joined string prints identically on Python 2 and 3.
            print(" ".join(str(part) for part in report))

compare()

In [0]:
%%sh
# Retention output location, kept for reference. A shell assignment takes no
# spaces around "="; with spaces the shell tries to run a command named "retention".
retention="s3://b2c-prod-data-pipeline-unified-usage/unified/usage.retention.v6/fact/product_type_code=domain/"
aws s3 ls s3://b2c-prod-data-pipeline-unified-usage/unified/app-tech.usage.legacy-ca_app.v2/fact/

In [0]:
%%sh
