In [0]:
%%sh
# Other listings, currently disabled:
# aws s3 ls s3://aardvark-prod-pdx-mdm-to-int/usage_retention/batch=retention_batch_D7_routine/version=v1.0.0/range_type=MONTH/date=2021-10-31/
# aws s3 ls s3://b2c-prod-dca-bdp-data/BDP-PROD-APP-INT-QA/bdp/user_data/application_name=aa_data_test_app/application_code/version=latest/code/bdp_resources/

# List the bdp_resources folder of the aa_data_test_app_debug application.
resources_uri="s3://b2c-prod-dca-bdp-data/BDP-PROD-APP-INT-QA/bdp/user_data/application_name=aa_data_test_app_debug/application_code/version=latest/code/bdp_resources/"
aws s3 ls "${resources_uri}"

In [0]:

## Spark Parquet Loader Test Example
## Ship the data test lib archive to the Python runtime
dependencies_zip = "s3://b2c-prod-dca-bdp-data/BDP-PROD-APP-INT-QA/bdp/user_data/application_name=aa_data_test_app/application_code/version=latest/code/bdp_resources/usr/lib/spark/python/dependencies.zip"
spark.sparkContext.addPyFile(dependencies_zip)

# Import the spark loader.
# NOTE(review): data_lib presumably ships inside the zip added above, so this
# import has to stay after addPyFile - confirm.
from data_lib.api import spark_loader

p_path = "s3://b2c-prod-data-pipeline-qa/fredric/retention_data_2021_10_31/"
loader = spark_loader(spark)

## Load the parquet data and build an in-memory temp view called retention
df = loader.load_parquet_as_table(p_path, "retention")
print(df.count())

## Show every distinct metric name: the row limit passed to show() is the
## number of distinct names itself.
df_metrics_names = spark.sql("select distinct(metric_name) from retention")
metric_name_count = df_metrics_names.count()
df_metrics_names.show(metric_name_count)


In [0]:
  
## Convert narrow table to wide example
from pyspark.sql.functions import *
from data_lib.api import NarrowToWideParams, convert_narrow_table_to_wide
from data_lib.api import spark_loader

p_path = "s3://b2c-prod-data-pipeline-qa/fredric/retention_data_2021_10_31/"
loader = spark_loader(spark)

## Read the parquet data into memory and register it under the name retention_narrow
df = loader.load_parquet_as_table(p_path, "retention_narrow")
df.show(n=1, truncate=False)

## Positional arguments of NarrowToWideParams, one per line.
## NOTE(review): the meaning of each position is defined by data_lib - confirm there.
narrow_to_wide_p = NarrowToWideParams(
    "retention_narrow",
    ["app_id", "device_type", "country", "age", "gender", "platform"],
    "metric_name",
    ["RRD0", "RRD1", "RRD2", "RRD3", "RRD4", "RRD5", "RRD6", "RRD7"],
    "value",
    None,
)
wide_df = convert_narrow_table_to_wide(spark, narrow_to_wide_p)

## Optional: save the wide df to a new path on the QA s3 bucket
# wide_df.write.parquet("s3://b2c-prod-data-pipeline-qa/fredric/retention_data_2021_10_31_wide/")

wide_df.show(n=5, truncate=False)



In [0]:


# Import only what this cell uses: a wildcard import of pyspark.sql.functions
# shadows Python builtins such as sum/min/max/round in the notebook namespace.
from pyspark.sql.functions import col
from data_lib.api import spark_loader
from data_lib.api.data_op import compare_columns_and_get_result_df

## Load wide df & do some comparison

loader = spark_loader(spark)
p_path = "s3://b2c-prod-data-pipeline-qa/fredric/retention_data_2021_10_31_wide/"

## This one is the raw data in S3.
## Registered as "retention_wide" so it does not overwrite the "retention_narrow"
## view that holds the narrow data.
wide_df = loader.load_parquet_as_table(p_path, "retention_wide")

print(wide_df.count())

## Snowflake data here
sf_df = loader.load_sf_as_table("select * from AA_INTELLIGENCE_PRODUCTION.ADL_USAGE_PAID.FACT_USAGE_RETENTION_DAY_V1_CLUSTER_BY_DATE where date_key=20211031 ", "retention_sf")
print(sf_df.count())

## Match raw rows to Snowflake rows on product, country and device
join_condition = [
    wide_df.app_id == sf_df.PRODUCT_KEY,
    wide_df.country == sf_df.COUNTRY_CODE,
    wide_df.device_type == sf_df.DEVICE_KEY,
]
joined_df = wide_df.join(sf_df, join_condition, "inner")

## *_SRC: D0 user count multiplied by the raw RRD value, cast to int
## *_DST: the Snowflake D1/D2 user count, cast to int
joined_df = (
    joined_df
    .withColumn("RRD1_SRC", (col("EST_TOTAL_RETENTION_USERS_D0") * col("RRD1")).cast("int"))
    .withColumn("RRD2_SRC", (col("EST_TOTAL_RETENTION_USERS_D0") * col("RRD2")).cast("int"))
    .withColumn("RRD1_DST", col("EST_TOTAL_RETENTION_USERS_D1").cast("int"))
    .withColumn("RRD2_DST", col("EST_TOTAL_RETENTION_USERS_D2").cast("int"))
)

## "RRD1_SRC", "RRD2_SRC"  Raw data metrics
## "RRD1_DST", "RRD2_DST"  Snowflake data metrics
res_df = compare_columns_and_get_result_df(joined_df, ["RRD1_SRC", "RRD2_SRC"], ["RRD1_DST", "RRD2_DST"])

## Split the result on the cal_result column of the comparison output
equals_df = res_df.filter(col("cal_result") == True)
not_equals_df = res_df.filter(col("cal_result") == False)
print(equals_df.count())
print(not_equals_df.count())

not_equals_df.show(1)


In [0]:


## Extended
## Apache Arrow Loader (Arrow is a C++ based columnar-storage library).
from data_lib.api import arrow_loader

query = "select * from AA_INTELLIGENCE_PRODUCTION.ADL_USAGE_PAID.FACT_USAGE_RETENTION_DAY_V1_CLUSTER_BY_DATE where date_key=20211031"
loader = arrow_loader()
df_table = loader.load_sf(query)

## Walk EST_TOTAL_RETENTION_USERS_D1 and stop at the first negative value.
## Missing (None) values are ignored; the offending value is printed.
retention_d1_column = df_table["EST_TOTAL_RETENTION_USERS_D1"]
is_greater_than_zero = True
for i in range(len(df_table)):
    r_d1 = retention_d1_column[i].as_py()
    if r_d1 is None or not (r_d1 < 0):
        continue
    print(r_d1)
    is_greater_than_zero = False
    break

assert is_greater_than_zero is True, "Test retention_d1 greater than zero failed"




In [0]:


## Extended
## Pandas loader

## Strongly recommended: avoid the pandas loader and pandas operations, they are too slow

from data_lib.api import pd_loader

loader = pd_loader()
query = "select * from AA_INTELLIGENCE_PRODUCTION.ADL_USAGE_PAID.FACT_USAGE_RETENTION_DAY_V1_CLUSTER_BY_DATE where date_key=20211031"
df_table = loader.load_sf(query)

## Vectorized replacement for the row-by-row iterrows() scan.
## dropna() removes missing values (None / NaN / NA) first, so they are
## ignored just like the original "is not None" guard ignored them.
retention_d1 = df_table["EST_TOTAL_RETENTION_USERS_D1"].dropna()

## The flag stays True when no value is negative (zero is accepted).
## bool() + not yields a plain Python bool, which the "is True" assert needs.
is_greater_than_zero = not bool((retention_d1 < 0).any())

assert is_greater_than_zero is True, "Test retention_d1 greater than zero failed"

    


In [0]:
 

from pyspark.sql.functions import *
from data_lib.api import spark_loader
from data_lib.api.data_op import compare_columns_and_get_result_df
from data_lib.api import Conv

## Load wide df, then convert to an Apache Arrow table and do some comparison

loader = spark_loader(spark)
p_path = "s3://b2c-prod-data-pipeline-qa/fredric/retention_data_2021_10_31_wide/"

## This one is the raw data in S3
wide_df = loader.load_parquet_as_table(p_path, "retention_wide")

## Snowflake data here
sf_df = loader.load_sf_as_table("select * from AA_INTELLIGENCE_PRODUCTION.ADL_USAGE_PAID.FACT_USAGE_RETENTION_DAY_V1_CLUSTER_BY_DATE where date_key=20211031 ", "retention_sf")


## Join Snowflake and raw data, then keep only the fields compared below
joined_df = wide_df.join(sf_df, [wide_df.app_id==sf_df.PRODUCT_KEY, wide_df.country==sf_df.COUNTRY_CODE,  wide_df.device_type==sf_df.DEVICE_KEY], "inner") \
    .select("RRD1", "RRD2", "EST_TOTAL_RETENTION_USERS_D0", "EST_TOTAL_RETENTION_USERS_D1", "EST_TOTAL_RETENTION_USERS_D2")

print(joined_df.count())

## Convert the pyspark table to an Apache Arrow table
arrow_df = Conv.spark_df_to_arrow(joined_df)

## Look each column up once instead of once per row
total_rd_column = arrow_df["EST_TOTAL_RETENTION_USERS_D0"]
src_percent1_column = arrow_df["RRD1"]
src_percent2_column = arrow_df["RRD2"]
act_d1_column = arrow_df["EST_TOTAL_RETENTION_USERS_D1"]
act_d2_column = arrow_df["EST_TOTAL_RETENTION_USERS_D2"]

## Do some comparison using the raw Python API
is_equal = True
# Rows that have Snowflake values but a missing D0 count or RRD value
skipped_rows = 0

for i in range(0, len(arrow_df)):
    total_rd = total_rd_column[i].as_py()

    src_rd_percent1 = src_percent1_column[i].as_py()
    src_rd_percent2 = src_percent2_column[i].as_py()

    act_r_d1 = act_d1_column[i].as_py()
    act_r_d2 = act_d2_column[i].as_py()

    # Same filter as before: only rows with both Snowflake values present and
    # a non-negative D1 value are compared.
    if act_r_d1 is None or act_r_d2 is None or not (act_r_d1 >= 0):
        continue

    # A None D0 count or RRD value would raise TypeError in the multiplication
    # below, so such rows are skipped and reported after the loop.
    if total_rd is None or src_rd_percent1 is None or src_rd_percent2 is None:
        skipped_rows += 1
        continue

    src_rd1 = int(total_rd * src_rd_percent1)
    src_rd2 = int(total_rd * src_rd_percent2)
    dst_rd1 = int(act_r_d1)
    dst_rd2 = int(act_r_d2)
    # Print a sample from the first ten row positions for eyeballing
    if i < 10:
        print("src_rd1: " + str(src_rd1))
        print("src_rd2: " + str(src_rd2))
        print("dst_rd1: " + str(dst_rd1))
        print("dst_rd2: " + str(dst_rd2))
    if src_rd1 != dst_rd1 or src_rd2 != dst_rd2:
        is_equal = False
        break

if skipped_rows > 0:
    print("rows skipped because of missing source values: " + str(skipped_rows))

assert is_equal is True, "Test retention_d1 and  retention_d2 equals failed"

In [0]:


import datetime
from data_lib.api import get_date_granularity_mapping_list

## Test get date granularity mapping list
## Window: 2020-09-20 16:59 through 2020-09-26 16:59
begin_date = datetime.datetime(year=2020, month=9, day=20, hour=16, minute=59)
end_date = datetime.datetime(year=2020, month=9, day=26, hour=16, minute=59)

date_granularity_mapping_list = get_date_granularity_mapping_list(begin_date, end_date)
print(date_granularity_mapping_list)


In [0]:


from data_lib.api import arrow_loader
from data_lib.utils.const import AURORA_OPTIONS

## Load a small sample from Aurora through the Arrow loader and print it
loader = arrow_loader()
df = loader.load_aurora("select * from adl_master.dim_event_service_v1 limit 10", AURORA_OPTIONS)

## Columns printed for each row, tab separated, in this order
printed_columns = ["product_key", "event_type_name", "old_value", "new_value"]

for i in range(0, len(df)):
    # str() is applied to every field, not only product_key: a NULL value
    # comes back as None and would raise TypeError in a plain string
    # concatenation. Non-null strings print exactly as before.
    print("\t".join([str(df[name][i].as_py()) for name in printed_columns]))
    print("\n")
    



