In [0]:

# Load the 2021-04-01 daily partition of CAVIAR_KEYWORD_METRICS (version 1.1.0)
# and print its column layout.
cpx_metrics_path = 's3://aardvark-prod-dca-data/oss/CAVIAR_KEYWORD_METRICS/version=1.1.0/range_type=DAY/date=2021-04-01/'
cpx_df = spark.read.parquet(cpx_metrics_path)
cpx_df.printSchema()

In [0]:
%%sh
# List the objects under the 2021-04-01 daily partition of
# CAVIAR_KEYWORD_METRICS (version 1.1.0).
aws s3 ls s3://aardvark-prod-dca-data/oss/CAVIAR_KEYWORD_METRICS/version=1.1.0/range_type=DAY/date=2021-04-01/


In [0]:
%%sh
# Unified data: list the entries under the daily-granularity fact prefix of
# store.product-download-channel.v1.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.product-download-channel.v1/fact/granularity_code=daily/
# NOTE(review): presumably the market_code values in this dataset are
# "apple-store" and "google-play" -- confirm against the data.



In [0]:

from pyspark.storagelevel import StorageLevel
from pyspark.sql.functions import *
from pyspark.sql.types import BooleanType

# Source side: DOWNLOAD_CHANNEL_KPI facts for 2021-02-19, cached and exposed
# to Spark SQL as "raw_df".
raw_df = spark.read.parquet("s3://aardvark-prod-dca-data/fact/DOWNLOAD_CHANNEL_KPI/version=1.0.0/date=2021-02-19/")
raw_df.persist(StorageLevel.MEMORY_AND_DISK)
raw_df.createOrReplaceTempView("raw_df")

raw_df.show(10, False)

# Columns that identify one row on the source side.
join_keys = ["app_id", "country_code", "platform", "device_type"]


def _metric_or_null(metric):
    """Column with metric_value on rows whose metric_name equals `metric`, null elsewhere."""
    return when(col('metric_name') == metric, col('metric_value')).otherwise(lit(None))


# Widen the frame with one share column per metric; each is null on rows that
# belong to a different metric.
raw_df_inter = raw_df
for share_col, metric in [
    ("est_organic_featured_share", 'predicted_featured_percent'),
    ("est_organic_search_share", 'predicted_true_organic_percent'),
    ("est_paid_in_app_ads_share", 'predicted_paid_in_app_ads_percent'),
    ("est_paid_search_share", 'predicted_paid_search_percent'),
]:
    raw_df_inter = raw_df_inter.withColumn(share_col, _metric_or_null(metric))

# For each share column, keep the distinct (keys, share) rows where the share
# is not null.
df1 = raw_df_inter.filter("est_organic_featured_share is not null").select(*join_keys, "est_organic_featured_share").distinct()
df2 = raw_df_inter.filter("est_organic_search_share is not null").select(*join_keys, "est_organic_search_share").distinct()
df3 = raw_df_inter.filter("est_paid_in_app_ads_share is not null").select(*join_keys, "est_paid_in_app_ads_share").distinct()
df4 = raw_df_inter.filter("est_paid_search_share is not null").select(*join_keys, "est_paid_search_share").distinct()

# Inner-join the four frames on the key columns to get one row per key with
# all four shares.
res_df = df1
for share_df in (df2, df3, df4):
    res_df = res_df.join(share_df, join_keys)

res_df.persist(StorageLevel.MEMORY_AND_DISK)

# Unified side: daily product-download-channel facts for the same date,
# cached and exposed to Spark SQL as "unfied_df".
unfied_df = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.product-download-channel.v1/fact/granularity_code=daily/date=2021-02-19/")
unfied_df.persist(StorageLevel.MEMORY_AND_DISK)
unfied_df.createOrReplaceTempView("unfied_df")

# Suffix the source shares with "_src" and rename app_id to product_key so the
# source frame can be joined to the unified frame.
res_df1 = res_df
for old_name, new_name in [
    ("est_organic_featured_share", "est_organic_featured_share_src"),
    ("est_organic_search_share", "est_organic_search_share_src"),
    ("est_paid_in_app_ads_share", "est_paid_in_app_ads_share_src"),
    ("est_paid_search_share", "est_paid_search_share_src"),
    ("app_id", "product_key"),
]:
    res_df1 = res_df1.withColumnRenamed(old_name, new_name)

# device_key is 7 where device_type is '0', otherwise device_type itself.
res_df1 = res_df1.withColumn("device_key", when(col('device_type') == '0', lit(7)).otherwise(col('device_type')))

final_df = res_df1.join(unfied_df, ["product_key", "country_code", "device_key"])

@udf(returnType=BooleanType())
def cal_diff(est_organic_featured_share_src, est_organic_search_share_src, est_paid_in_app_ads_share_src, est_paid_search_share_src, est_organic_featured_share, est_organic_search_share, est_paid_in_app_ads_share, est_paid_search_share):
    """Tell whether all four ``*_src`` shares equal their counterparts.

    Each ``*_src`` argument is compared with ``==`` against the argument of
    the same name without the suffix.

    Returns:
        True when all four pairs compare equal, False otherwise.
    """
    share_pairs = (
        (est_organic_featured_share_src, est_organic_featured_share),
        (est_organic_search_share_src, est_organic_search_share),
        (est_paid_in_app_ads_share_src, est_paid_in_app_ads_share),
        (est_paid_search_share_src, est_paid_search_share),
    )
    for src_value, other_value in share_pairs:
        # Stop at the first pair that differs.
        if not (src_value == other_value):
            return False
    return True

# Add the cal_diff verdict to every joined row: the four "_src" shares are
# passed first, then the four same-named shares without the suffix.
src_share_cols = ["est_organic_featured_share_src", "est_organic_search_share_src", "est_paid_in_app_ads_share_src", "est_paid_search_share_src"]
other_share_cols = ["est_organic_featured_share", "est_organic_search_share", "est_paid_in_app_ads_share", "est_paid_search_share"]
diff_df = final_df.withColumn("is_same_metric", cal_diff(*[col(name) for name in src_share_cols + other_share_cols]))

# Number of rows where at least one share differs.
mismatched_rows = diff_df.select("is_same_metric").filter(col("is_same_metric")==False)
mismatched_rows.count()
        




In [0]:

from pyspark.storagelevel import StorageLevel
from pyspark.sql.functions import *
from pyspark.sql.types import BooleanType

import os

date_ = '2021-02-19'

# Unified daily product-download-channel facts for the chosen date.
unfied_df = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.product-download-channel.v1/fact/granularity_code=daily/date={}/".format(date_))

# snowflake production
# The password is read from the environment so that no credential is stored in
# the notebook. NOTE(review): the password that used to be written here should
# be rotated, since it has been committed in plain text.
sf_password = os.environ.get("SNOWFLAKE_PROD_PASSWORD")
if not sf_password:
    raise RuntimeError("Set the SNOWFLAKE_PROD_PASSWORD environment variable before running this cell.")

sfOptions = {
  "sfURL" : "appannie_aa_int_prod.us-east-1.snowflakecomputing.com",
  "sfUser" : "app_bdp_data_validation_qa",
  "sfPassword" : sf_password,
  "sfDatabase" : "AA_INTELLIGENCE_PRODUCTION",
  "sfSchema" : "ADL_STORE_PAID",
  "sfWarehouse" : "wh_dod_read7"
}
SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"

# Rows of the Snowflake fact table for the same date.
db_df = spark.read.format(SNOWFLAKE_SOURCE_NAME) \
  .options(**sfOptions) \
  .option("query",  "select * from FACT_STORE_PRODUCT_DOWNLOAD_CHANNEL_V1_CLUSTER_BY_DATE where date='{}'".format(date_)) \
  .load()

# Left join keeps every Snowflake row, matched to the unified data on the
# three key columns.
res_df = db_df.join(unfied_df, ['product_key', 'country_code', 'device_key'], 'left')

res_df.persist(StorageLevel.MEMORY_AND_DISK)
res_df.createOrReplaceTempView("res_df")

# For each channel, select EST_DOWNLOAD * share (suffixed "1") next to the
# existing *_download column so the two can be compared.
caled_df = spark.sql("select  country_code,est_download, EST_DOWNLOAD*est_organic_search_share as   est_organic_search_download1, est_organic_search_download, EST_DOWNLOAD*est_organic_featured_share as   est_organic_featured_download1, est_organic_featured_download,  EST_DOWNLOAD*est_paid_in_app_ads_share as  est_paid_in_app_ads_download1,     est_paid_in_app_ads_download,  EST_DOWNLOAD*est_paid_search_share as est_paid_search_download1, est_paid_search_download from res_df")

@udf(returnType=BooleanType())
def cal_right(est_download,est_organic_featured_share_src, est_organic_search_share_src, est_paid_in_app_ads_share_src, est_paid_search_share_src, est_organic_featured_share, est_organic_search_share, est_paid_in_app_ads_share, est_paid_search_share):
    """Check four recomputed values against their stored counterparts.

    Each ``*_src`` argument is paired with the argument of the same name
    without the suffix. A pair matches when the stored value equals
    ``int(src)`` or ``int(src + 1)``, i.e. the recomputed value truncated, or
    truncated after adding one.

    When ``est_organic_search_share_src`` is None the pairwise checks are
    skipped and the result is ``est_download == est_organic_search_share``.

    A None, NaN or infinite ``*_src`` value in any other pair makes that pair
    a mismatch. Previously such a value raised inside the UDF (``int(None)``
    is a TypeError) and failed the whole Spark job.

    Returns:
        True when the row passes the check, False otherwise.
    """

    def _matches(src_value, stored_value):
        # A missing recomputed value cannot confirm the stored one.
        if src_value is None:
            return False
        try:
            truncated = int(src_value)
            truncated_plus_one = int(src_value + 1)
        except (ValueError, OverflowError):
            # int() rejects NaN (ValueError) and infinities (OverflowError).
            return False
        return truncated == stored_value or truncated_plus_one == stored_value

    if est_organic_search_share_src is None:
        return est_download == est_organic_search_share

    pairs = (
        (est_organic_featured_share_src, est_organic_featured_share),
        (est_organic_search_share_src, est_organic_search_share),
        (est_paid_in_app_ads_share_src, est_paid_in_app_ads_share),
        (est_paid_search_share_src, est_paid_search_share),
    )
    # all() over a generator stops at the first failing pair, like the
    # original `and` chain.
    return all(_matches(src_value, stored_value) for src_value, stored_value in pairs)
    
# Add the cal_right verdict to every row: est_download first, then the four
# recomputed "*_download1" columns, then the four existing "*_download" columns.
recomputed_cols = ['est_organic_featured_download1', 'est_organic_search_download1', 'est_paid_in_app_ads_download1', 'est_paid_search_download1']
stored_cols = ['est_organic_featured_download', 'est_organic_search_download', 'est_paid_in_app_ads_download', 'est_paid_search_download']
udf_inputs = [col("est_download")] + [col(name) for name in recomputed_cols + stored_cols]
new_df = caled_df.withColumn("cal_right", cal_right(*udf_inputs))

# Number of rows that fail the check.
failed_rows = new_df.select("cal_right").filter(col("cal_right")==False)
failed_rows.count()


In [0]:


from pyspark.storagelevel import StorageLevel
from pyspark.sql.functions import *
from pyspark.sql.types import BooleanType

def print_count(date_):
    """Print the distinct product_key counts of both data sources for one date.

    Prints the count from the unified parquet partition first, then the count
    from the Snowflake fact table.

    Args:
        date_: Date string used in the S3 partition path (``date=<date_>``)
            and in the Snowflake ``where date='<date_>'`` filter.

    Raises:
        RuntimeError: If the SNOWFLAKE_PROD_PASSWORD environment variable is
            not set.
    """
    import os

    unfied_df = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.product-download-channel.v1/fact/granularity_code=daily/date={}/".format(date_))

    # snowflake production
    # The password is read from the environment so that no credential is
    # stored in the notebook. NOTE(review): the password that used to be
    # written here should be rotated, since it has been committed in plain text.
    sf_password = os.environ.get("SNOWFLAKE_PROD_PASSWORD")
    if not sf_password:
        raise RuntimeError("Set the SNOWFLAKE_PROD_PASSWORD environment variable before calling print_count.")

    sfOptions = {
        "sfURL" : "appannie_aa_int_prod.us-east-1.snowflakecomputing.com",
        "sfUser" : "app_bdp_data_validation_qa",
        "sfPassword" : sf_password,
        "sfDatabase" : "AA_INTELLIGENCE_PRODUCTION",
        "sfSchema" : "ADL_STORE_PAID",
        "sfWarehouse" : "wh_dod_read7"
    }
    SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"
    db_df = spark.read.format(SNOWFLAKE_SOURCE_NAME) \
        .options(**sfOptions) \
        .option("query",  "select * from FACT_STORE_PRODUCT_DOWNLOAD_CHANNEL_V1_CLUSTER_BY_DATE where date='{}'".format(date_)) \
        .load()

    print(unfied_df.select("product_key").distinct().count())
    print(db_df.select("product_key").distinct().count())
    

def get_date_range_count(prefix, start, end):
    """Call print_count for every day number from start to end, inclusive.

    The date passed on is ``prefix`` followed by the day number zero-padded to
    two digits, so day 8 with prefix '2021-02-' gives '2021-02-08'. Without
    the padding, days 1-9 produced strings such as '2021-02-8'.

    Args:
        prefix: Leading part of the date string, e.g. '2021-02-'.
        start: First day number (int).
        end: Last day number (int), included in the range.
    """
    for day in range(start, end + 1):
        date_ = "{}{:02d}".format(prefix, day)
        print("Date: " + date_)
        print_count(date_)
        print("\n")
       
        
# Print the product counts of both sources for each day from 2021-02-19
# through 2021-02-28.
prefix = '2021-02-'
get_date_range_count(prefix, start=19, end=28)





In [0]:

from pyspark.storagelevel import StorageLevel
from pyspark.sql.functions import *
from pyspark.sql.types import BooleanType

# Preview the first 10 rows, untruncated, of the unified daily
# product-download-channel partition for the chosen date.
date_ = '2021-02-19'
unified_partition_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.product-download-channel.v1/fact/granularity_code=daily/date={}/".format(date_)

unfied_df = spark.read.parquet(unified_partition_path)
unfied_df.show(n=10, truncate=False)

In [0]:
%%sh

# List the entries under the daily-granularity fact prefix of
# store.product-download-channel.v1.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.product-download-channel.v1/fact/granularity_code=daily/



In [0]:

import os

## snowflake testing
# The password is read from the environment so that no personal credential is
# stored in the notebook. NOTE(review): the password that used to be written
# here should be changed, since it has been committed in plain text.
sf_password = os.environ.get("SNOWFLAKE_TEST_PASSWORD")
if not sf_password:
    raise RuntimeError("Set the SNOWFLAKE_TEST_PASSWORD environment variable before running this cell.")

sfOptions = {
  "sfURL" : "appannie.us-east-1.snowflakecomputing.com",
  "sfUser" : "fzhu",
  "sfPassword" : sf_password,
  "sfDatabase" : "ONE_SERVICE_TEST_DB_AABJFZHU",
  "sfSchema" : "ADL_STORE_PAID",
  "sfWarehouse" : "DEMO_WH"
}
SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"

# Fetch a 10-row sample of the table and display it.
df = spark.read.format(SNOWFLAKE_SOURCE_NAME) \
  .options(**sfOptions) \
  .option("query",  "select * from FACT_STORE_MARKET_DOWNLOAD_REVENUE_V1 limit 10") \
  .load()
df.show()


In [0]:


import os

# snowflake production
# The password is read from the environment so that no credential is stored in
# the notebook. NOTE(review): the password that used to be written here should
# be rotated, since it has been committed in plain text.
sf_password = os.environ.get("SNOWFLAKE_PROD_PASSWORD")
if not sf_password:
    raise RuntimeError("Set the SNOWFLAKE_PROD_PASSWORD environment variable before running this cell.")

sfOptions = {
  "sfURL" : "appannie_aa_int_prod.us-east-1.snowflakecomputing.com",
  "sfUser" : "app_bdp_data_validation_qa",
  "sfPassword" : sf_password,
  "sfDatabase" : "AA_INTELLIGENCE_PRODUCTION",
  "sfSchema" : "ADL_ASO_PAID",
  "sfWarehouse" : "wh_dod_read7"
}
SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"

# Fetch a 10-row sample of the table and display it.
df = spark.read.format(SNOWFLAKE_SOURCE_NAME) \
  .options(**sfOptions) \
  .option("query",  "select * from FACT_ASO_KEYWORD_CPX_V1_CLUSTER_BY_DATE limit 10") \
  .load()
df.show()

In [0]:
%%sh

# Search the whole filesystem, starting at /, for entries named "site-packages".
find / -name "site-packages"


In [0]:


import sys

# Directory that holds the vector_add module.
# NOTE(review): this is an absolute, environment-specific path -- confirm it
# exists wherever the notebook runs.
vector_add_dir = "/mnt/application_name=bdp_fredric_test/application_code/version=latest/code/bdp_resources/python/"

# Guarded so that re-running the cell does not keep adding duplicate entries
# to sys.path.
if vector_add_dir not in sys.path:
    sys.path.append(vector_add_dir)
import vector_add

# Build a CT over the integers 1..1999999 and run its vector_add method.
c = vector_add.CT(range(1, 2000000))
res = c.vector_add()

res





