In [0]:
%%sh
# List the objects under the S3 prefix that the Spark parquet loader example below reads from.
aws s3 ls s3://aardvark-prod-pdx-mdm-to-int/usage_retention/batch=retention_batch_D7_routine/version=v1.0.0/range_type=MONTH/date=2021-10-31/


In [0]:

## Example: exercising the Spark parquet loader
## Ship the data-test library archive to the Python runtime first
spark.sparkContext.addPyFile("s3://b2c-prod-dca-bdp-data/BDP-PROD-APP-INT-QA/bdp/user_data/application_name=aa_data_test_app/application_code/version=latest/code/bdp_resources/usr/lib/spark/python/dependencies.zip")

# Imported after addPyFile; presumably data_lib is provided by that archive (confirm)
from data_lib.api import spark_loader

p_path = "s3://aardvark-prod-pdx-mdm-to-int/usage_retention/batch=retention_batch_D7_routine/version=v1.0.0/range_type=MONTH/date=2021-10-31/"
loader = spark_loader(spark)

## Read the parquet data and expose it as the in-memory temp view "retention"
df = loader.load_parquet_as_table(p_path, "retention")

# Total number of rows that were loaded
row_total = df.count()
print(row_total)

## List the distinct metric names; passing the count to show() prints all of them
df_metrics_names = spark.sql("select distinct(metric_name) from retention")
metric_total = df_metrics_names.count()
df_metrics_names.show(metric_total)


In [0]:
  
## Example: converting a narrow table into a wide one
from pyspark.sql.functions import *
from data_lib.api import NarrowToWideParams, convert_narrow_table_to_wide, spark_loader

p_path = "s3://b2c-prod-data-pipeline-qa/fredric/retention_data_2021_10_31/"
loader = spark_loader(spark)

# Load the narrow data and register it as the temp view "retention_narrow"
df = loader.load_parquet_as_table(p_path, "retention_narrow")

# Peek at one row without truncating long values
df.show(1, False)

# Positional arguments; presumably (confirm against data_lib): source view,
# key columns, metric-name column, metric names to turn into columns,
# value column. The meaning of the trailing None is not visible here.
narrow_to_wide_p = NarrowToWideParams(
    "retention_narrow",
    ["app_id", "device_type", "country", "age", "gender", "platform"],
    "metric_name",
    ["RRD0", "RRD1", "RRD2", "RRD3", "RRD4", "RRD5", "RRD6", "RRD7"],
    "value",
    None,
)
wide_df = convert_narrow_table_to_wide(spark, narrow_to_wide_p)

## Optional: persist the wide result to a new path in the QA bucket
# wide_df.write.parquet("s3://b2c-prod-data-pipeline-qa/fredric/retention_data_2021_10_31_wide/")

# Show the first five wide rows, untruncated
wide_df.show(5, False)



In [0]:


## Load the wide retention table and compare it against the Snowflake fact table.
## Import only the name this cell uses: a wildcard import of
## pyspark.sql.functions also rebinds Python builtins such as sum/min/max.
from pyspark.sql.functions import col
from data_lib.api import spark_loader
from data_lib.api.data_op import compare_columns_and_get_result_df

loader = spark_loader(spark)
p_path = "s3://b2c-prod-data-pipeline-qa/fredric/retention_data_2021_10_31_wide/"
# Register the wide data under its own view name so it does not replace the
# "retention_narrow" view that the narrow-to-wide example registers.
wide_df = loader.load_parquet_as_table(p_path, "retention_wide")

print(wide_df.count())

sf_df = loader.load_sf_as_table("select * from AA_INTELLIGENCE_PRODUCTION.ADL_USAGE_PAID.FACT_USAGE_RETENTION_DAY_V1_CLUSTER_BY_DATE where date_key=20211031 ", "retention_sf")
print(sf_df.count())

# Match each wide row with the Snowflake rows for the same app, country and device.
join_conditions = [
    wide_df.app_id == sf_df.PRODUCT_KEY,
    wide_df.country == sf_df.COUNTRY_CODE,
    wide_df.device_type == sf_df.DEVICE_KEY,
]
joined_df = wide_df.join(sf_df, join_conditions, "inner")

# *_SRC: day-0 user estimate multiplied by the RRDn column, cast to int.
# *_DST: the day-n user estimate column, cast to int.
joined_df = (
    joined_df
    .withColumn("RRD1_SRC", (col("EST_TOTAL_RETENTION_USERS_D0") * col("RRD1")).cast("int"))
    .withColumn("RRD2_SRC", (col("EST_TOTAL_RETENTION_USERS_D0") * col("RRD2")).cast("int"))
    .withColumn("RRD1_DST", col("EST_TOTAL_RETENTION_USERS_D1").cast("int"))
    .withColumn("RRD2_DST", col("EST_TOTAL_RETENTION_USERS_D2").cast("int"))
)

# Compare the SRC columns with the DST columns; the filters below rely on the
# result carrying a "cal_result" column (presumably True when the pair matches).
res_df = compare_columns_and_get_result_df(joined_df, ["RRD1_SRC", "RRD2_SRC"], ["RRD1_DST", "RRD2_DST"])

equals_df = res_df.filter(col("cal_result") == True)
not_equals_df = res_df.filter(col("cal_result") == False)
print(equals_df.count())
print(not_equals_df.count())

# Show one mismatching row as a sample
not_equals_df.show(1)


In [0]:


## Extended
## Apache Arrow loader (Arrow is a C++-based columnar library).
## Checks that no EST_TOTAL_RETENTION_USERS_D1 value is negative.
from data_lib.api import arrow_loader

loader = arrow_loader()
query = "select * from AA_INTELLIGENCE_PRODUCTION.ADL_USAGE_PAID.FACT_USAGE_RETENTION_DAY_V1_CLUSTER_BY_DATE where date_key=20211031"
df_table = loader.load_sf(query)

# Resolve the column once instead of looking it up again on every iteration.
d1_column = df_table["EST_TOTAL_RETENTION_USERS_D1"]

# NOTE: despite the flag name, the check only rejects negative values;
# zero and missing (None) values pass.
is_greater_than_zero = True
for i in range(len(df_table)):
    r_d1 = d1_column[i].as_py()
    if r_d1 is not None and r_d1 < 0:
        # Print the first offending value, then stop scanning.
        print(r_d1)
        is_greater_than_zero = False
        break

assert is_greater_than_zero is True, "Test retention_d1 greater than zero failed"




In [0]:


## Extended
## Pandas loader
## Checks that no EST_TOTAL_RETENTION_USERS_D1 value is negative.

from data_lib.api import pd_loader

loader = pd_loader()
query = "select * from AA_INTELLIGENCE_PRODUCTION.ADL_USAGE_PAID.FACT_USAGE_RETENTION_DAY_V1_CLUSTER_BY_DATE where date_key=20211031"
df_table = loader.load_sf(query)

# NOTE: despite the flag name, the check only rejects negative values;
# zero and missing (None) values pass.
is_greater_than_zero = True
# Walk only the column that is inspected: iterrows() would build a full row
# object per record, and its throwaway "_" loop variable would shadow
# IPython's last-output variable.
for r_d1 in df_table["EST_TOTAL_RETENTION_USERS_D1"]:
    if r_d1 is not None and r_d1 < 0:
        is_greater_than_zero = False
        break

assert is_greater_than_zero is True, "Test retention_d1 greater than zero failed"

    
