In [0]:


from pyspark.storagelevel import StorageLevel
from pyspark.sql.functions import *
from pyspark.sql.types import BooleanType

import json


## Production Environments

# Snowflake production connection options.
# The password is read from the SNOWFLAKE_PASSWORD environment variable instead
# of being stored in the notebook; a KeyError on that name means it is not set.
# NOTE(review): the password that used to be committed in this cell should be
# rotated, since it remains visible in the notebook's history.
import os

sfOptions = {
  "sfURL" : "appannie_aa_int_prod.us-east-1.snowflakecomputing.com",
  "sfUser" : "app_bdp_data_validation_qa",
  "sfPassword" : os.environ["SNOWFLAKE_PASSWORD"],
  "sfDatabase" : "AA_INTELLIGENCE_PRODUCTION",
  "sfSchema" : "ADL_MASTER",
  "sfWarehouse" : "wh_dod_read7"
}
SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"

# Select the screenshot_change events with change_time on or after 2021-05-01.
query_statement = '''select event_type_name, product_key,change_time, old_value, new_value, meta from ADL_MASTER.DIM_EVENT_V2_CLUSTER_BY_PRODUCT_KEY 
                 where event_type_name='screenshot_change' and change_time >= '2021-05-01' '''
df = spark.read.format(SNOWFLAKE_SOURCE_NAME) \
  .options(**sfOptions) \
  .option("query", query_statement) \
  .load()

# NOTE(review): this forces a full read, but the count is only displayed when
# the expression is the last one in the cell - confirm it is still wanted here.
df.count()

def get_common_keys(old_keys, new_keys):
    """Return the entries of ``old_keys`` that also occur in ``new_keys``.

    The result is a new list that follows the iteration order of ``old_keys``.
    """
    return [candidate for candidate in old_keys if candidate in new_keys]


@udf(returnType=BooleanType())
def is_cal_correct(old_value, new_value, meta_):
    """Check one event row's meta flags against its old/new value lengths.

    All three arguments are JSON strings and are parsed with ``json.loads``.

    Example payloads:
        meta:      "default": "111"
        old value: "default": ["1.png"]
        new value: "default": ["2.png"]

    Returns:
        True when any parsed payload is a list (such rows are not checked) or
        when no violation is found. False as soon as a meta entry whose value
        contains no "1" matches, by prefix, a key shared by the old and new
        payloads whose two values differ in length.
    """
    parsed_old = json.loads(old_value)
    parsed_new = json.loads(new_value)
    parsed_meta = json.loads(meta_)

    # List-shaped payloads are not compared key by key; treat the row as passing.
    if any(isinstance(payload, list) for payload in (parsed_old, parsed_new, parsed_meta)):
        return True

    shared_keys = sorted(
        get_common_keys(parsed_old.keys(), parsed_new.keys()),
        reverse=True,
    )

    for meta_key, flags in parsed_meta.items():
        for shared_key in shared_keys:
            # A meta entry applies to every shared key it is a prefix of.
            if not shared_key.startswith(meta_key):
                continue
            same_length = len(parsed_old[shared_key]) == len(parsed_new[shared_key])
            if flags.find("1") == -1 and not same_length:
                return False

    return True
    
    

# Score only the rows where all three JSON columns are present.
result_df = (
    df.filter("old_value is not null and new_value is not null and meta is not null")
    .withColumn(
        "is_cal_right",
        is_cal_correct(col("old_value"), col("new_value"), col("meta")),
    )
)
# Split the scored rows by the boolean verdict.
wrong_df = result_df.filter(~col("is_cal_right"))
right_df = result_df.filter(col("is_cal_right"))
print("CAL_WRONG_COUNT:" + str(wrong_df.count()))
wrong_df.show(10, truncate=False)



In [0]:


from pyspark.storagelevel import StorageLevel
from pyspark.sql.functions import *
from pyspark.sql.types import BooleanType


import json


## Production Environments

# Snowflake production connection options.
# The password is read from the SNOWFLAKE_PASSWORD environment variable instead
# of being stored in the notebook; a KeyError on that name means it is not set.
# NOTE(review): the password that used to be committed in this cell should be
# rotated, since it remains visible in the notebook's history.
import os

sfOptions = {
  "sfURL" : "appannie_aa_int_prod.us-east-1.snowflakecomputing.com",
  "sfUser" : "app_bdp_data_validation_qa",
  "sfPassword" : os.environ["SNOWFLAKE_PASSWORD"],
  "sfDatabase" : "AA_INTELLIGENCE_PRODUCTION",
  "sfSchema" : "ADL_MASTER",
  "sfWarehouse" : "wh_dod_read7"
}
SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"

# Select the screenshot_change events with change_time on or after 2021-05-01.
query_statement = '''select event_type_name, product_key,change_time, old_value, new_value, meta from ADL_MASTER.DIM_EVENT_V2_CLUSTER_BY_PRODUCT_KEY 
                 where event_type_name='screenshot_change' and change_time >= '2021-05-01' '''
df = spark.read.format(SNOWFLAKE_SOURCE_NAME) \
  .options(**sfOptions) \
  .option("query", query_statement) \
  .load()

# NOTE(review): this forces a full read, but the count is only displayed when
# the expression is the last one in the cell - confirm it is still wanted here.
df.count()


def get_ios_key(key_str):
    """Collapse a key to its device family when it has an iOS prefix.

    Returns "iphone" for keys starting with "iphone", "ipad" for keys starting
    with "ipad", and the key unchanged otherwise.
    """
    for family in ("iphone", "ipad"):
        if key_str.startswith(family):
            return family
    return key_str

def merge_keys(src, dest):
    """Append the normalized keys of ``dest`` to ``src``, skipping duplicates.

    Each key of ``dest`` is passed through ``get_ios_key`` first. ``src`` is
    modified in place and the same list object is returned.
    """
    for normalized in map(get_ios_key, dest):
        if normalized in src:
            continue
        src.append(normalized)
    return src

def cal_meta_contains_no_new_value_key(meta_keys, merged_keys):
    """Return True only if every entry of ``meta_keys`` is in ``merged_keys``.

    An empty ``meta_keys`` yields True.
    """
    return all(meta_key in merged_keys for meta_key in meta_keys)


@udf(returnType=BooleanType())
def is_key_consistent(old_value, new_value, meta_):
    """Check that every meta key also appears among the old/new value keys.

    All three arguments are JSON strings and are parsed with ``json.loads``.
    The keys of the old and new payloads are combined and normalized through
    ``merge_keys`` before the meta keys are looked up in them.

    Returns:
        True when any parsed payload is a list (such rows are skipped as bad
        data) or when every meta key is found in the merged key list;
        False otherwise.
    """
    parsed_old = json.loads(old_value)
    parsed_new = json.loads(new_value)
    parsed_meta = json.loads(meta_)

    # Skip malformed rows whose payload is a list instead of an object.
    if any(isinstance(payload, list) for payload in (parsed_old, parsed_new, parsed_meta)):
        return True

    value_keys = list(parsed_old.keys()) + list(parsed_new.keys())
    normalized_keys = sorted(merge_keys([], value_keys), reverse=True)
    expected_meta_keys = sorted(parsed_meta.keys(), reverse=True)

    return cal_meta_contains_no_new_value_key(expected_meta_keys, normalized_keys)
    

# Score only the rows where all three JSON columns are present.
result_df = (
    df.filter("old_value is not null and new_value is not null and meta is not null")
    .withColumn(
        "is_key_consistent",
        is_key_consistent(col("old_value"), col("new_value"), col("meta")),
    )
)
# Split the scored rows by the boolean verdict.
wrong_df = result_df.filter(~col("is_key_consistent"))
right_df = result_df.filter(col("is_key_consistent"))
print("INCONSISTENT_COUNT:" + str(wrong_df.count()))
wrong_df.show(10, truncate=False)



In [0]:


# Snowflake production connection options (no default schema in this cell).
# The password is read from the SNOWFLAKE_PASSWORD environment variable instead
# of being stored in the notebook; a KeyError on that name means it is not set.
# NOTE(review): the password that used to be committed in this cell should be
# rotated, since it remains visible in the notebook's history.
import os

sfOptions = {
  "sfURL" : "appannie_aa_int_prod.us-east-1.snowflakecomputing.com",
  "sfUser" : "app_bdp_data_validation_qa",
  "sfPassword" : os.environ["SNOWFLAKE_PASSWORD"],
  "sfDatabase" : "AA_INTELLIGENCE_PRODUCTION",
  "sfWarehouse" : "wh_dod_read7"
}
SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"

# Select the screenshot_change events with change_time on or after 2021-05-01.
query_statement = '''select event_type_name, product_key,change_time, old_value, new_value, meta from ADL_MASTER.DIM_EVENT_V2_CLUSTER_BY_PRODUCT_KEY 
                 where event_type_name='screenshot_change' and change_time >= '2021-05-01' '''
df = spark.read.format(SNOWFLAKE_SOURCE_NAME) \
  .options(**sfOptions) \
  .option("query", query_statement) \
  .load()
  
# df.show(10)

# NOTE(review): spark.sql() is not given the Snowflake options above, so this
# query goes through the Spark session's own catalog - confirm the table is
# registered there.
df2 = spark.sql("select * from  ADL_MASTER.DIM_EVENT_V2_CLUSTER_BY_PRODUCT_KEY limit 10")
df2.show(1)


In [0]:
%%sh

# aws s3 cp s3://b2c-prod-dca-bdp-data/BDP-PROD-APP-INT-QA/user_data/zeppelin_521/notebook/2G9J8782Y/note.json - 

# aws s3 cp  s3://b2c-prod-dca-bdp-data/BDP-PROD-APP-INT-QA/user_data/zeppelin/notebook/2DRSB9VB4/note.json - 

# aws s3 ls  s3://b2c-prod-dca-bdp-data/BDP-PROD-APP-INT-QA/user_data/zeppelin/notebook/2DRSB9VB4/note.json

# Count the lines that `aws s3 ls` prints for the zeppelin_521 notebook prefix.
aws s3 ls s3://b2c-prod-dca-bdp-data/BDP-PROD-APP-INT-QA/user_data/zeppelin_521/notebook/ | wc -l

In [0]:
%%sh
