In [0]:

import pandas as pd
from pyspark.sql import functions as F
from datetime import datetime
from applications.db_check_v1.common.db_check_utils import query
from conf.settings import PG_USAGE_HOSTS, PG_USAGE_NAME, PG_USAGE_ACCESS_ID, PG_USAGE_SECRET_KEY, \
    CITUS_USAGE_NAME, CITUS_USAGE_ACCESS_ID, CITUS_USAGE_HOSTS, CITUS_USAGE_SECRET_KEY

# key='value' connection string for the plproxy usage database, assembled from
# conf.settings. Only the first PG_USAGE_HOSTS entry is used: its element 0 is
# the host and its element 1 the port.
PLPROXY_DSN = (
    "dbname='{db}' user='{user}' password='{password}' "
    "host='{host}' port='{port}'"
).format(
    db=PG_USAGE_NAME,
    user=PG_USAGE_ACCESS_ID,
    password=PG_USAGE_SECRET_KEY,
    host=PG_USAGE_HOSTS[0][0],
    port=PG_USAGE_HOSTS[0][1],
)

def get_date_list(begin_date, end_date, freq):
    """Return the dates between two bounds as 'YYYY-MM-DD' strings.

    Args:
        begin_date: Start of the range, passed to ``pd.date_range(start=...)``.
        end_date: End of the range, passed to ``pd.date_range(end=...)``.
        freq: pandas frequency alias (e.g. ``"D"``, ``"W-SAT"``, ``"M"``).

    Returns:
        list of str: One formatted date per generated timestamp, in
        ascending order; empty when the range yields no timestamps.
    """
    stamps = pd.date_range(start=begin_date, end=end_date, freq=freq)
    return [stamp.strftime('%Y-%m-%d') for stamp in stamps]


# Date window used to build the per-granularity date lists.
# The day is written as plain `1`: a leading-zero literal such as `01` is an
# octal literal in Python 2 and a SyntaxError in Python 3.
begin_date = datetime(2020, 1, 1)
end_date = datetime(2020, 3, 31)

# 'YYYY-MM-DD' strings per granularity, generated with the pandas frequency
# aliases "D" (every day), "W-SAT" (weekly, anchored on Saturday) and
# "M" (month end).
DATE_GRANULARITY_MAPPINGLIST = {
    "daily": get_date_list(begin_date, end_date, "D"),
    "weekly": get_date_list(begin_date, end_date, "W-SAT"),
    "monthly": get_date_list(begin_date, end_date, "M")
}


# Flip each list in place so the most recent date comes first.
DATE_GRANULARITY_MAPPINGLIST["monthly"].reverse()
DATE_GRANULARITY_MAPPINGLIST["weekly"].reverse()
DATE_GRANULARITY_MAPPINGLIST["daily"].reverse()

# Single-argument print(...) produces the same output on Python 2 and 3.
print(DATE_GRANULARITY_MAPPINGLIST["monthly"])
print(DATE_GRANULARITY_MAPPINGLIST["weekly"])
print(DATE_GRANULARITY_MAPPINGLIST["daily"])

In [0]:

granularity_list = ["monthly", "weekly", "daily"]
category_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.category.v6/dimension/product_type_code=app/"
usage_basic_after_transform = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v6/fact/product_type_code=app/"

# The two Delta reads do not depend on the granularity, so they are built once
# here; only the filter differs per loop iteration.
category_df = spark.read.format("delta").load(category_path)
basic_df = spark.read.format("delta").load(usage_basic_after_transform)

# For each granularity, join the basic-KPI facts with the category dimension
# and register the result as temp view basic_with_category_<granularity>.
# category_view and basic_view are scratch views that are overwritten on
# every iteration.
for granularity in granularity_list:
    filter_str = "date between '2020-01-01' and '2020-03-31' and granularity_code = '{granularity}'".format(granularity=granularity)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(filter_str)
    category_df.filter(filter_str).createOrReplaceTempView("category_view")
    basic_df.filter(filter_str).createOrReplaceTempView("basic_view")
    spark.sql("""
    select basic.DATE_KEY,
        basic.DEVICE_CODE,
        basic.COUNTRY_CODE,basic.COUNTRY_KEY,
        basic.date,
        basic.DEVICE_KEY,basic.GRANULARITY_CODE,basic.GRANULARITY_KEY,basic.MARKET_CODE,
        basic.MARKET_KEY,basic.PRODUCT_KEY,basic.PRODUCT_TYPE_CODE,basic.PRODUCT_TYPE_KEY,
        basic.EST_TOTAL_TIME_MILLISECONDS_OF_MAIN_CATEGORY,basic.EST_WIFI_BYTES,basic.EST_ACTIVE_USERS,
        basic.EST_TOTAL_BYTES,basic.EST_TOTAL_SESSION_COUNT_OF_MAIN_CATEGORY,
        basic.EST_TOTAL_BYTES_OF_MAIN_CATEGORY,basic.EST_TOTAL_ACTIVE_DAYS,basic.EST_INSTALL_BASE,
        basic.EST_POPULATION,basic.EST_TOTAL_TIME_MILLISECONDS,basic.EST_TOTAL_SESSION_COUNT,
        basic.DEVICE_FORM_FACTOR_CODE,basic.DEVICE_FORM_FACTOR_KEY,basic.PARENT_DEVICE_CODE,basic.PARENT_DEVICE_KEY,basic.PLATFORM_CODE,basic.PLATFORM_KEY,
        basic.est_usage_penetration,
        map.category_key as category_key,
        map.unified_category_key as unified_category_key
    from basic_view basic
    join category_view map
    on basic.date = map.date
    and basic.product_key = map.product_key
    and basic.granularity_code = map.granularity_code
    and basic.country_code = map.country_code
    and basic.device_code = map.device_code
    """).createOrReplaceTempView("basic_with_category_{granularity}".format(granularity=granularity))
    # Optional row-count checks, left disabled:
    # spark.sql("select count(1) from category_view").show(1, False)
    # spark.sql("select count(1) from basic_with_category_{granularity}".format(granularity=granularity)).show(1, False)
    


In [0]:

import pandas as pd
from pyspark.sql import functions as F
from datetime import datetime
from applications.db_check_v1.common.db_check_utils import query
from pyspark.sql import Row
from conf.settings import PG_USAGE_HOSTS, PG_USAGE_NAME, PG_USAGE_ACCESS_ID, PG_USAGE_SECRET_KEY, \
    CITUS_USAGE_NAME, CITUS_USAGE_ACCESS_ID, CITUS_USAGE_HOSTS, CITUS_USAGE_SECRET_KEY
from pyspark.sql.types import StructType, StructField, LongType, IntegerType, DoubleType, ShortType

# key='value' connection string for the plproxy usage database, assembled from
# conf.settings. Only the first PG_USAGE_HOSTS entry is used: its element 0 is
# the host and its element 1 the port.
PLPROXY_DSN = (
    "dbname='{db}' user='{user}' password='{password}' "
    "host='{host}' port='{port}'"
).format(
    db=PG_USAGE_NAME,
    user=PG_USAGE_ACCESS_ID,
    password=PG_USAGE_SECRET_KEY,
    host=PG_USAGE_HOSTS[0][0],
    port=PG_USAGE_HOSTS[0][1],
)

# Full set of legacy category ids, kept for reference; the active list below is
# restricted to a single id.
# legacy_category_list = [36, 6014, 7012, 6004, 6008]
# Unified category keys (the values of category_mapping). Not referenced by the
# functions defined in this cell.
unified_category_list = [800000, 800001, 800005, 800035, 800031]
# Legacy category ids iterated by compare_single() and compare_aggr().
legacy_category_list = [36]
# legacy category_id -> unified_category_key, used to pick the matching
# category when querying the basic_with_category_* views.
category_mapping = {
    36: 800000,
    6014:800001,
    7012:800005,
    6004:800035,
    6008:800031
}

# Template with placeholder {legacy_category_id}: rows ranked <= 1000 for three
# month-end dates from the monthly table, fetched through
# plproxy.execute_select. Formatted and executed by compare_aggr().
aggr_sql = """select  app_id, rank, kpi, estimate from plproxy.execute_select($proxy$
SELECT app_id, rank, kpi, estimate
FROM mu.category_monthly_2001_143441 where 
date in ('2020-01-31', '2020-02-29','2020-03-31') 
and rank <= 1000 
and category_id = {legacy_category_id} $proxy$)
 t (app_id bigint, rank integer, kpi smallint, estimate double precision) order by rank asc;
"""

# Top-1000 app_ids by MAX(estimate) over a weekly table; the date range and
# category_id 6014 are hard-coded (no placeholders). Not referenced by the
# functions defined in this cell. It returns a single column, so it does not
# fit the four-column rows get_plproxy_result() expects.
aggr_sql_aa = """
SELECT app_id
    FROM plproxy.execute_select_nestloop($proxy$
        SELECT app_id, MAX(estimate) AS estimate
        FROM mu.category_weekly_2001_143441
        WHERE date BETWEEN '2020-08-21' AND '2020-08-30' AND category_id = 6014 AND rank <= 1000
        GROUP BY app_id
        ORDER BY MAX(estimate) DESC
        LIMIT 1000
    $proxy$) t (app_id BIGINT, estimate FLOAT8)
    GROUP BY app_id
    ORDER BY MAX(estimate) DESC, app_id ASC
    LIMIT 1000
"""

# Template with placeholder {legacy_category_id}: rows ranked <= 1000 for the
# single date '2020-07-01' from the daily table. Formatted and executed by
# compare_single().
single_sql =  """select  app_id, rank, kpi, estimate from plproxy.execute_select($proxy$
SELECT app_id, rank, kpi, estimate
FROM mu.category_daily_2001_143441
where date = '2020-07-01'
and rank <= 1000 
and category_id = {legacy_category_id} $proxy$)
 t (app_id bigint, rank integer, kpi smallint, estimate double precision) order by rank asc;
"""

# Template with placeholder {unified_category_id}: domain-level rows ranked
# <= 1000 by usage penetration for '2020-09-30'. Not referenced by the
# functions defined in this cell. It returns three columns, so it does not fit
# the four-column rows get_plproxy_result() expects.
domain_single_dql = """
select  domain_id, rank_est_usage_penetration, est_usage_penetration from plproxy.execute_select($proxy$
SELECT domain_id, rank_est_usage_penetration, est_usage_penetration
FROM mw.category_m_ip_us where 
date = '2020-09-30' 
and rank_est_usage_penetration <= 1000 
and category_id = {unified_category_id} $proxy$)
 t (domain_id bigint, rank_est_usage_penetration integer, est_usage_penetration double precision) order by rank_est_usage_penetration asc;
""" 



def get_plproxy_result(sql_str):
    """Run ``sql_str`` through plproxy and return the rows as a Spark DataFrame.

    Args:
        sql_str: SQL text handed to ``query`` together with ``PLPROXY_DSN``.
            Every returned row must be indexable and carry
            ``app_id, rank, kpi, estimate`` at positions 0-3.

    Returns:
        pyspark.sql.DataFrame with the non-nullable columns ``app_id`` (long),
        ``rank`` (int), ``kpi`` (short) and ``estimate`` (double).
    """
    rows = query(PLPROXY_DSN, sql_str)

    schema = StructType([
        StructField("app_id", LongType(), False),
        StructField("rank", IntegerType(), False),
        StructField("kpi", ShortType(), False),
        StructField("estimate", DoubleType(), False),
    ])

    # Plain positional tuples line up with the schema order directly. Rows
    # built from keyword arguments have their fields sorted alphabetically on
    # Spark < 3.0, which makes the column order depend on the Spark version.
    data = [(r[0], r[1], r[2], r[3]) for r in rows]
    return spark.createDataFrame(data=data, schema=schema)
    
def get_unified_data():
    """Register the distinct active domain ids as temp view ``unified_df_new``.

    Loads the Delta data at the hard-coded mobile-web path below, exposes it as
    temp view ``test_unified`` and registers ``unified_df_new`` with the
    distinct ``domain_id`` values whose ``est_average_active_users`` is neither
    null nor zero. Returns nothing; both temp views are overwritten.
    """
    domain_unified_source_path = "s3://b2c-prod-data-pipeline-unified-mobileweb-paid/unified/mobileweb.basic.v4/fact/granularity=w/month=202005/date=2020-05-16"

    # Load the path defined just above. No `unified_source_path` variable
    # exists in this notebook, so referring to it raised NameError.
    spark.read.format("delta").load(domain_unified_source_path).createOrReplaceTempView("test_unified")
    spark.sql("select distinct domain_id from test_unified where  est_average_active_users <> 0 and est_average_active_users is not null order by domain_id asc").createOrReplaceTempView("unified_df_new")

def get_plproxy_data(sql_str=None):
    """Fetch plproxy rows and show a distinct-app count and per-app average.

    Registers the fetched rows as temp view ``plproxy_df_new``, then shows the
    number of distinct ``app_id`` values and the average ``estimate`` per
    ``app_id`` in descending order.

    Args:
        sql_str: Query forwarded to ``get_plproxy_result``, which requires one.
            When omitted, ``aggr_sql`` formatted with the first entry of
            ``legacy_category_list`` is used.
            NOTE(review): this default is an assumption; confirm which query
            this helper was meant to run.
    """
    if sql_str is None:
        sql_str = aggr_sql.format(legacy_category_id=legacy_category_list[0])

    df_plproxy = get_plproxy_result(sql_str)
    df_plproxy.createOrReplaceTempView("plproxy_df_new")
    spark.sql("select count(distinct app_id) from plproxy_df_new").show(10, False)
    # spark.sql("select * from plproxy_df_new order by rank asc").show(10000, False)

    spark.sql("select app_id, avg(estimate) as estimate from plproxy_df_new group by app_id order by estimate desc").show(10000, False)



def compare_single():
    """Compare one day's top-1000 apps between plproxy and the unified daily view.

    For every id in ``legacy_category_list``:
      1. run ``single_sql`` through plproxy and register it as ``plproxy_df``;
      2. take the 1000 ``product_key`` values with the highest
         ``est_usage_penetration`` from ``basic_with_category_daily``
         (ios-phone, US, mapped unified category) as ``apps_new``;
      3. register both one-sided differences (``plproxy_diff_new``,
         ``new_diff_plproxy``) and show how many app ids each contains.

    Returns nothing; the four temp views are overwritten on every iteration.

    NOTE(review): the queries use date '2020-07-01', while the cell that builds
    ``basic_with_category_daily`` filters dates to 2020-01-01..2020-03-31. With
    that window ``apps_new`` is empty, so confirm the intended date.
    """
    for category in legacy_category_list:
        df_plproxy = get_plproxy_result(single_sql.format(legacy_category_id=category))
        df_plproxy.createOrReplaceTempView("plproxy_df")

        # Unified side: top 1000 apps by usage penetration for the mapped category.
        spark.sql("""
            select product_key as app_id 
            from basic_with_category_daily 
            where date ='2020-07-01'
            and device_code = 'ios-phone' 
            and country_code='US' 
            and unified_category_key={unified_category} 
            order by est_usage_penetration desc limit 1000
            """
            .format(unified_category=category_mapping[category])).createOrReplaceTempView("apps_new")

        # Apps present in plproxy but missing from the unified result.
        spark.sql("""
            select app_id 
            from plproxy_df 
            except 
            select app_id 
            from apps_new""").createOrReplaceTempView("plproxy_diff_new")
        # Apps present in the unified result but missing from plproxy.
        spark.sql("""
            select app_id 
            from apps_new 
            except 
            select app_id 
            from plproxy_df""").createOrReplaceTempView("new_diff_plproxy")

        # Single-argument print(...) behaves identically on Python 2 and 3.
        print("category is {category}".format(category=category))
        # spark.sql("select app_id from plproxy_df").show(1000, False)
        # spark.sql("select app_id from apps_new").show(1000, False)

        spark.sql("""
            select count(1) as plproxy_diff_new 
            from plproxy_diff_new""").show(10, False)
        spark.sql("""
            select count(1) as new_diff_plproxy 
            from new_diff_plproxy""").show(10, False)
        # Uncomment to list the differing app ids themselves:
        # spark.sql("""
        #     select app_id as plproxy_diff_new
        #     from plproxy_diff_new""").show(10, False)
        # spark.sql("""
        #     select app_id as new_diff_plproxy
        #     from new_diff_plproxy""").show(10, False)


def compare_aggr():
    """Compare aggregated top-1000 apps between plproxy and the unified monthly view.

    For every id in ``legacy_category_list``:
      1. run ``aggr_sql`` through plproxy (``plproxy_df``) and keep the 1000
         apps with the highest ``max(estimate)`` as ``plproxy_aggr``;
      2. compute ``sum(est_active_users) / sum(est_population)`` per app from
         ``basic_with_category_monthly`` (ios-phone, US, mapped unified
         category, 2020-01-31..2020-03-31) and keep the top 1000 as
         ``apps_new``;
      3. register both one-sided differences (``plproxy_diff_new``,
         ``new_diff_plproxy``) and show the ``plproxy_aggr`` rows.

    The unified side reads ``basic_with_category_monthly`` because the query
    filters on ``granularity_code = 'monthly'``. The daily view only holds
    rows with ``granularity_code = 'daily'``, so that filter returns nothing
    there.

    Returns nothing; the temp views are overwritten on every iteration.
    """
    for category in legacy_category_list:
        df_plproxy = get_plproxy_result(aggr_sql.format(legacy_category_id=category))
        df_plproxy.createOrReplaceTempView("plproxy_df")

        # plproxy side: best estimate per app across the fetched dates.
        spark.sql("""
            select app_id,
                max(estimate) as estimate 
            from plproxy_df 
            group by app_id 
            order by estimate desc 
            limit 1000""").createOrReplaceTempView("plproxy_aggr")

        # Unified side: aggregated usage penetration per app.
        spark.sql("""
        
            select product_key as app_id, 
                sum(est_active_users)/sum(est_population) as aggr_up 
            from basic_with_category_monthly
            where date between '2020-01-31' and '2020-03-31'  
            and device_code = 'ios-phone'
            and granularity_code = 'monthly'
            and country_code = 'US'
            and product_type_code='app'
            and unified_category_key={unified_category} 
            group by app_id 
            order by aggr_up desc 
            limit 1000"""

            .format(unified_category=category_mapping[category])).createOrReplaceTempView("apps_new")

        # One-sided differences between the two top-1000 lists.
        spark.sql("select app_id from plproxy_aggr except select app_id from apps_new").createOrReplaceTempView("plproxy_diff_new")
        spark.sql("select app_id from apps_new except select app_id from plproxy_aggr").createOrReplaceTempView("new_diff_plproxy")

        # Single-argument print(...) behaves identically on Python 2 and 3.
        print("category is {category}".format(category=category))
        spark.sql("select app_id,estimate from plproxy_aggr").show(1000, False)
        # Uncomment to inspect the unified list and the differences:
        # spark.sql("select app_id from apps_new").show(1000, False)
        # spark.sql("select count(1) as plproxy_diff_new from plproxy_diff_new").show(10, False)
        # spark.sql("select count(1) as new_diff_plproxy from new_diff_plproxy").show(10, False)
        # spark.sql("select app_id as plproxy_diff_new from plproxy_diff_new").show(10, False)
        # spark.sql("select app_id as new_diff_plproxy from new_diff_plproxy").show(10, False)
        
    
# Entry point of this cell: only the aggregated comparison is executed.
# Uncomment compare_single() to run the single-day comparison as well.
# compare_single()
compare_aggr()
# Ad-hoc peek at the weekly mobile-web Delta data, left disabled:
# unified_source_path = "s3://b2c-prod-data-pipeline-unified-mobileweb-paid/unified/mobileweb.basic.v4/fact/granularity=w/date=2020-05-16"
# spark.read.format("delta").load(unified_source_path).show(10)
