In [0]:

import pandas as pd
from pyspark.sql import functions as F
from datetime import datetime
from applications.db_check_v1.common.db_check_utils import query
from conf.settings import PG_USAGE_HOSTS, PG_USAGE_NAME, PG_USAGE_ACCESS_ID, PG_USAGE_SECRET_KEY, \
    CITUS_USAGE_NAME, CITUS_USAGE_ACCESS_ID, CITUS_USAGE_HOSTS, CITUS_USAGE_SECRET_KEY

# Key/value connection string for the PL/Proxy usage database. Only the first
# entry of PG_USAGE_HOSTS is used; its two elements are read as (host, port).
PLPROXY_DSN = " ".join([
    "dbname='{db}'",
    "user='{user}'",
    "password='{password}'",
    "host='{host}'",
    "port='{port}'",
]).format(
    db=PG_USAGE_NAME,
    user=PG_USAGE_ACCESS_ID,
    password=PG_USAGE_SECRET_KEY,
    host=PG_USAGE_HOSTS[0][0],
    port=PG_USAGE_HOSTS[0][1],
)

def get_date_list(begin_date, end_date, freq):
    """Return the dates between two bounds as 'YYYY-MM-DD' strings.

    Args:
        begin_date: start of the range, passed to ``pd.date_range(start=...)``.
        end_date: end of the range, passed to ``pd.date_range(end=...)``.
        freq: pandas frequency alias that selects the step between dates.

    Returns:
        list of str: one formatted date per timestamp produced by
        ``pd.date_range``, in ascending order.
    """
    stamps = pd.date_range(start=begin_date, end=end_date, freq=freq)
    return [stamp.strftime('%Y-%m-%d') for stamp in stamps]


# Date window covered by this notebook.
# The day was written as `01`; a leading-zero integer literal is a SyntaxError
# on Python 3, while plain `1` has the same value on both Python 2 and 3.
begin_date = datetime(2020, 1, 1)
end_date = datetime(2020, 3, 31)

# Dates per granularity, newest first. Each list is reversed while it is
# built instead of being mutated in place afterwards.
# Frequency aliases: "D" = every day, "W-SAT" = weekly on Saturdays,
# "M" = month end.
DATE_GRANULARITY_MAPPINGLIST = {
    "daily": get_date_list(begin_date, end_date, "D")[::-1],
    "weekly": get_date_list(begin_date, end_date, "W-SAT")[::-1],
    "monthly": get_date_list(begin_date, end_date, "M")[::-1]
}

# Single-argument print(...) produces the same output on Python 2 and 3.
print(DATE_GRANULARITY_MAPPINGLIST["monthly"])
print(DATE_GRANULARITY_MAPPINGLIST["weekly"])
print(DATE_GRANULARITY_MAPPINGLIST["daily"])

In [0]:

# Build one temp view per granularity (basic_with_category_monthly / _weekly /
# _daily) that joins the basic-KPI fact rows with their category mapping.
granularity_list = ["monthly", "weekly", "daily"]
category_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.category.v6/dimension/product_type_code=app/"
usage_basic_after_transform = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v6/fact/product_type_code=app/"
for granularity in granularity_list:
    # Same date window and granularity predicate for both source tables.
    filter_str = (
        "date between '2020-01-01' and '2020-03-31' "
        "and granularity_code = '{granularity}'"
    ).format(granularity=granularity)
    print(filter_str)
    (spark.read.format("delta")
        .load(category_path)
        .filter(filter_str)
        .createOrReplaceTempView("category_view"))
    (spark.read.format("delta")
        .load(usage_basic_after_transform)
        .filter(filter_str)
        .createOrReplaceTempView("basic_view"))
    # category_view and basic_view are overwritten on every iteration; the
    # joined result is registered under a granularity-specific name.
    spark.sql("""
    select basic.DATE_KEY,
        basic.DEVICE_CODE,
        basic.COUNTRY_CODE,basic.COUNTRY_KEY,
        basic.date,
        basic.DEVICE_KEY,basic.GRANULARITY_CODE,basic.GRANULARITY_KEY,basic.MARKET_CODE,
        basic.MARKET_KEY,basic.PRODUCT_KEY,basic.PRODUCT_TYPE_CODE,basic.PRODUCT_TYPE_KEY,
        basic.EST_TOTAL_TIME_MILLISECONDS_OF_MAIN_CATEGORY,basic.EST_WIFI_BYTES,basic.EST_ACTIVE_USERS,
        basic.EST_TOTAL_BYTES,basic.EST_TOTAL_SESSION_COUNT_OF_MAIN_CATEGORY,
        basic.EST_TOTAL_BYTES_OF_MAIN_CATEGORY,basic.EST_TOTAL_ACTIVE_DAYS,basic.EST_INSTALL_BASE,
        basic.EST_POPULATION,basic.EST_TOTAL_TIME_MILLISECONDS,basic.EST_TOTAL_SESSION_COUNT,
        basic.DEVICE_FORM_FACTOR_CODE,basic.DEVICE_FORM_FACTOR_KEY,basic.PARENT_DEVICE_CODE,basic.PARENT_DEVICE_KEY,basic.PLATFORM_CODE,basic.PLATFORM_KEY,
        basic.est_usage_penetration,
        map.category_key as category_key,
        map.unified_category_key as unified_category_key
    from basic_view basic
    join category_view map
    on basic.date = map.date
    and basic.product_key = map.product_key
    and basic.granularity_code = map.granularity_code
    and basic.country_code = map.country_code
    and basic.device_code = map.device_code
    """).createOrReplaceTempView("basic_with_category_" + granularity)
    


In [0]:

# Show the ten largest est_population values for US / ios-phone products in
# unified category 800000 on two weekly dates.
# NOTE(review): basic_with_category_weekly is built above from rows filtered to
# 2020-01-01..2020-03-31, so these 2020-09 dates look like they would match
# nothing -- confirm the intended window.
(
    spark.sql("""select product_key,est_population from basic_with_category_weekly  
            where date in  ('2020-09-05', '2020-09-12') 
            and device_code = 'ios-phone' 
            and country_code='US' 
            and unified_category_key=800000
            ORDER by est_population DESC
           """)
    .show(10, False)
)

In [0]:

import pandas as pd
from pyspark.sql import functions as F
from datetime import datetime
from applications.db_check_v1.common.db_check_utils import query
from pyspark.sql import Row
from conf.settings import PG_USAGE_HOSTS, PG_USAGE_NAME, PG_USAGE_ACCESS_ID, PG_USAGE_SECRET_KEY, \
    CITUS_USAGE_NAME, CITUS_USAGE_ACCESS_ID, CITUS_USAGE_HOSTS, CITUS_USAGE_SECRET_KEY
from pyspark.sql.types import StructType, StructField, LongType, IntegerType, DoubleType, ShortType

# Key/value connection string for the PL/Proxy usage database. Only the first
# entry of PG_USAGE_HOSTS is used; its two elements are read as (host, port).
PLPROXY_DSN = " ".join([
    "dbname='{db}'",
    "user='{user}'",
    "password='{password}'",
    "host='{host}'",
    "port='{port}'",
]).format(
    db=PG_USAGE_NAME,
    user=PG_USAGE_ACCESS_ID,
    password=PG_USAGE_SECRET_KEY,
    host=PG_USAGE_HOSTS[0][0],
    port=PG_USAGE_HOSTS[0][1],
)

# Wider set of legacy category ids, kept for reference; the active
# legacy_category_list below narrows the run to a single category.
# legacy_category_list = [36, 6014, 7012, 6004, 6008]
# Unified category keys. The compare_* functions in this cell iterate
# legacy_category_list, not this list.
unified_category_list = [800000, 800001, 800005, 800035, 800031]
legacy_category_list = [6016]
# Legacy category id -> unified category key; used to turn each legacy id into
# the unified_category_key filter of the Spark-side query.
category_mapping = {
    36: 800000,
    6014:800001,
    7012:800005,
    6004:800035,
    6008:800031,
    6016:800022
}

# Template (placeholder: {legacy_category_id}): rows with rank <= 1000 from
# mu.category_monthly_2001_143441 for two dates, fetched through
# plproxy.execute_select and ordered by rank.
aggr_sql = """select  app_id, rank, kpi, estimate from plproxy.execute_select($proxy$
SELECT app_id, rank, kpi, estimate
FROM mu.category_monthly_2001_143441 where 
date in ('2020-06-30', '2020-07-31') 
and rank <= 1000 
and category_id = {legacy_category_id} $proxy$)
 t (app_id bigint, rank integer, kpi smallint, estimate double precision) order by rank asc;
"""

# Fixed query (category 6014 hard-coded, no placeholders) over the weekly
# table; not referenced by the functions defined in this cell.
aggr_sql_aa = """
SELECT app_id
    FROM plproxy.execute_select_nestloop($proxy$
        SELECT app_id, MAX(estimate) AS estimate
        FROM mu.category_weekly_2001_143441
        WHERE date BETWEEN '2020-08-21' AND '2020-08-30' AND category_id = 6014 AND rank <= 1000
        GROUP BY app_id
        ORDER BY MAX(estimate) DESC
        LIMIT 1000
    $proxy$) t (app_id BIGINT, estimate FLOAT8)
    GROUP BY app_id
    ORDER BY MAX(estimate) DESC, app_id ASC
    LIMIT 1000
"""

# Template (placeholder: {legacy_category_id}): rows with rank <= 1000 from
# the daily table for a single date; used by compare_single().
single_sql =  """select  app_id, rank, kpi, estimate from plproxy.execute_select($proxy$
SELECT app_id, rank, kpi, estimate
FROM mu.category_daily_2001_143441
where date = '2020-07-01'
and rank <= 1000 
and category_id = {legacy_category_id} $proxy$)
 t (app_id bigint, rank integer, kpi smallint, estimate double precision) order by rank asc;
"""

# Template (placeholder: {unified_category_id}) over mw.category_m_ip_us; not
# referenced by the functions defined in this cell.
domain_single_dql = """
select  domain_id, rank_est_usage_penetration, est_usage_penetration from plproxy.execute_select($proxy$
SELECT domain_id, rank_est_usage_penetration, est_usage_penetration
FROM mw.category_m_ip_us where 
date = '2020-09-30' 
and rank_est_usage_penetration <= 1000 
and category_id = {unified_category_id} $proxy$)
 t (domain_id bigint, rank_est_usage_penetration integer, est_usage_penetration double precision) order by rank_est_usage_penetration asc;
""" 



def get_plproxy_result(sql_str):
    """Run ``sql_str`` against PL/Proxy and return the rows as a Spark DataFrame.

    Args:
        sql_str: complete SQL statement whose first four result columns are
            app_id, rank, kpi and estimate, in that order.

    Returns:
        DataFrame with non-nullable columns app_id (long), rank (int),
        kpi (short) and estimate (double).

    Presumably ``query`` returns an iterable of indexable rows -- it is only
    iterated and indexed here; verify against db_check_utils.
    """
    result = query(PLPROXY_DSN, sql_str)

    # Plain positional tuples instead of Row(app_id=..., ...): on Spark < 3.0 a
    # keyword-built Row sorts its fields alphabetically, so pairing it with a
    # positional schema depends on version-specific name matching. Tuples line
    # up with the schema order below on every version.
    df_data = [(r[0], r[1], r[2], r[3]) for r in result]

    _schema = StructType([
        StructField("app_id", LongType(), False),
        StructField("rank", IntegerType(), False),
        StructField("kpi", ShortType(), False),
        StructField("estimate", DoubleType(), False),
    ])
    return spark.createDataFrame(data=df_data, schema=_schema)
    
def get_unified_data():
    """Register the distinct active domain ids of one weekly partition.

    Loads the delta data at the hard-coded mobileweb path as temp view
    ``test_unified`` and registers ``unified_df_new`` with the distinct
    domain_id values whose est_average_active_users is non-null and non-zero.
    """
    domain_unified_source_path = "s3://b2c-prod-data-pipeline-unified-mobileweb-paid/unified/mobileweb.basic.v4/fact/granularity=w/month=202005/date=2020-05-16"

    # The original loaded `unified_source_path`, a name that is never defined
    # (its only other occurrence is commented out), which raised NameError.
    spark.read.format("delta").load(domain_unified_source_path).createOrReplaceTempView("test_unified")
    spark.sql("select distinct domain_id from test_unified where  est_average_active_users <> 0 and est_average_active_users is not null order by domain_id asc").createOrReplaceTempView("unified_df_new")

def get_plproxy_data(sql_str=None):
    """Load PL/Proxy rows into ``plproxy_df_new`` and print summary queries.

    Args:
        sql_str: SQL to run through get_plproxy_result(). The original body
            called get_plproxy_result() with no argument, which always raised
            TypeError. When omitted, ``aggr_sql`` for the first entry of
            ``legacy_category_list`` is used.
            NOTE(review): that default is an assumption, chosen because the
            per-app average below suggests several dates -- confirm.

    Shows the number of distinct app ids, then the average estimate per app
    in descending order.
    """
    if sql_str is None:
        sql_str = aggr_sql.format(legacy_category_id=legacy_category_list[0])

    df_plproxy = get_plproxy_result(sql_str)
    df_plproxy.createOrReplaceTempView("plproxy_df_new")
    spark.sql("select count(distinct app_id) from plproxy_df_new").show(10, False)
    spark.sql("select app_id, avg(estimate) as estimate from plproxy_df_new group by app_id order by estimate desc").show(10000, False)



def compare_single():
    """Compare single-date top-1000 app sets: PL/Proxy vs. the unified view.

    For each id in ``legacy_category_list`` the PL/Proxy rows are registered
    as ``plproxy_df`` and the unified top 1000 (by est_usage_penetration) as
    ``apps_new``. The function then prints the category id and shows how many
    app ids exist on one side only, in both directions.

    NOTE(review): the basic_with_category_* views are built earlier in this
    notebook from rows filtered to 2020-01-01..2020-03-31, so the 2020-07-01
    filter below looks like it would match nothing -- confirm.
    """
    for legacy_id in legacy_category_list:
        plproxy_rows = get_plproxy_result(single_sql.format(legacy_category_id=legacy_id))
        plproxy_rows.createOrReplaceTempView("plproxy_df")

        # Unified side: translate the legacy id into its unified category key.
        unified_top_sql = """
            select product_key as app_id 
            from basic_with_category_daily 
            where date ='2020-07-01'
            and device_code = 'ios-phone' 
            and country_code='US' 
            and unified_category_key={unified_category} 
            order by est_usage_penetration desc limit 1000
            """.format(unified_category=category_mapping[legacy_id])
        spark.sql(unified_top_sql).createOrReplaceTempView("apps_new")

        # App ids present in PL/Proxy but missing from the unified result.
        only_in_plproxy_sql = """
            select app_id 
            from plproxy_df 
            except 
            select app_id 
            from apps_new"""
        spark.sql(only_in_plproxy_sql).createOrReplaceTempView("plproxy_diff_new")

        # App ids present in the unified result but missing from PL/Proxy.
        only_in_unified_sql = """
            select app_id 
            from apps_new 
            except 
            select app_id 
            from plproxy_df"""
        spark.sql(only_in_unified_sql).createOrReplaceTempView("new_diff_plproxy")

        print("category is {category}".format(category=legacy_id))

        count_queries = (
            """
            select count(1) as plproxy_diff_new 
            from plproxy_diff_new""",
            """
            select count(1) as new_diff_plproxy 
            from new_diff_plproxy""",
        )
        for count_sql in count_queries:
            spark.sql(count_sql).show(10, False)


def compare_aggr():
    """Build the multi-date aggregates for each legacy category and show the PL/Proxy side.

    For each id in ``legacy_category_list``:
      * the PL/Proxy rows are registered as ``plproxy_df``, and the top 1000
        apps by max(estimate) as ``plproxy_aggr``;
      * the top 1000 apps by sum(est_active_users)/sum(est_population) from
        ``basic_with_category_monthly`` are registered as ``apps_new``;
      * ``plproxy_aggr`` is displayed.

    ``apps_new`` is registered but not displayed or diffed here.
    """
    for category in legacy_category_list:
        df_plproxy = get_plproxy_result(aggr_sql.format(legacy_category_id=category))
        df_plproxy.createOrReplaceTempView("plproxy_df")
        spark.sql("""
            select app_id,
                max(estimate) as estimate
            from plproxy_df
            group by app_id
            order by estimate desc
            limit 1000""").createOrReplaceTempView("plproxy_aggr")

        # The bounds were written as between '2020-08-31' and '2020-03-31'.
        # BETWEEN means low <= x <= high, so reversed bounds can never match;
        # they are swapped here so the predicate can match.
        # NOTE(review): aggr_sql in this cell asks PL/Proxy for 2020-06-30 and
        # 2020-07-31, so confirm the intended window on the unified side.
        spark.sql("""
            select product_key as app_id,
                sum(est_active_users)/sum(est_population) as aggr_up
            from basic_with_category_monthly
            where date between '2020-03-31' and '2020-08-31'
            and device_code = 'ios-phone'
            and granularity_code = 'monthly'
            and country_code = 'US'
            and product_type_code='app'
            and unified_category_key={unified_category}
            group by app_id
            order by aggr_up desc
            limit 1000"""
            .format(unified_category=category_mapping[category])).createOrReplaceTempView("apps_new")

        spark.sql("select app_id,estimate from plproxy_aggr").show(1000, False)
        
    
# Cell driver: only the multi-date comparison runs; the single-date check and
# the ad-hoc partition preview below are left disabled.
# compare_single()   
compare_aggr()
# unified_source_path = "s3://b2c-prod-data-pipeline-unified-mobileweb-paid/unified/mobileweb.basic.v4/fact/granularity=w/date=2020-05-16"
# spark.read.format("delta").load(unified_source_path).show(10)


In [0]:

import pandas as pd
from pyspark.sql import functions as F
from datetime import datetime
from applications.db_check_v1.common.db_check_utils import query
from pyspark.sql import Row
from conf.settings import PG_USAGE_HOSTS, PG_USAGE_NAME, PG_USAGE_ACCESS_ID, PG_USAGE_SECRET_KEY, \
    CITUS_USAGE_NAME, CITUS_USAGE_ACCESS_ID, CITUS_USAGE_HOSTS, CITUS_USAGE_SECRET_KEY
from pyspark.sql.types import StructType, StructField, LongType, IntegerType, DoubleType, ShortType

# Key/value connection string for the PL/Proxy usage database. Only the first
# entry of PG_USAGE_HOSTS is used; its two elements are read as (host, port).
PLPROXY_DSN = " ".join([
    "dbname='{db}'",
    "user='{user}'",
    "password='{password}'",
    "host='{host}'",
    "port='{port}'",
]).format(
    db=PG_USAGE_NAME,
    user=PG_USAGE_ACCESS_ID,
    password=PG_USAGE_SECRET_KEY,
    host=PG_USAGE_HOSTS[0][0],
    port=PG_USAGE_HOSTS[0][1],
)

# Wider set of legacy category ids, kept for reference; the active
# legacy_category_list below narrows the run to a single category.
# legacy_category_list = [36, 6014, 7012, 6004, 6008]
# Unified category keys. The compare_* functions in this cell iterate
# legacy_category_list, not this list.
unified_category_list = [800000, 800001, 800005, 800035, 800031]
legacy_category_list = [36]
# Legacy category id -> unified category key; used to turn each legacy id into
# the unified_category_key filter of the Spark-side query.
category_mapping = {
    36: 800000,
    6014:800001,
    7012:800005,
    6004:800035,
    6008:800031
}

# Template (placeholder: {legacy_category_id}): rows with rank <= 1000 from
# mu.category_monthly_2001_143441 for two dates, fetched through
# plproxy.execute_select and ordered by rank; used by compare_aggr().
aggr_sql = """select  app_id, rank, kpi, estimate from plproxy.execute_select($proxy$
SELECT app_id, rank, kpi, estimate
FROM mu.category_monthly_2001_143441 where 
date in ('2020-01-31', '2020-03-31') 
and rank <= 1000 
and category_id = {legacy_category_id} $proxy$)
 t (app_id bigint, rank integer, kpi smallint, estimate double precision) order by rank asc;
"""

# Fixed query (category 6014 hard-coded, no placeholders) over the weekly
# table; not referenced by the functions defined in this cell.
aggr_sql_aa = """
SELECT app_id
    FROM plproxy.execute_select_nestloop($proxy$
        SELECT app_id, MAX(estimate) AS estimate
        FROM mu.category_weekly_2001_143441
        WHERE date BETWEEN '2020-08-21' AND '2020-08-30' AND category_id = 6014 AND rank <= 1000
        GROUP BY app_id
        ORDER BY MAX(estimate) DESC
        LIMIT 1000
    $proxy$) t (app_id BIGINT, estimate FLOAT8)
    GROUP BY app_id
    ORDER BY MAX(estimate) DESC, app_id ASC
    LIMIT 1000
"""

# Template (placeholder: {legacy_category_id}): rows with rank <= 1000 from
# the daily table for a single date; used by compare_single().
single_sql =  """select  app_id, rank, kpi, estimate from plproxy.execute_select($proxy$
SELECT app_id, rank, kpi, estimate
FROM mu.category_daily_2001_143441
where date = '2020-07-01'
and rank <= 1000 
and category_id = {legacy_category_id} $proxy$)
 t (app_id bigint, rank integer, kpi smallint, estimate double precision) order by rank asc;
"""

# Template (placeholder: {unified_category_id}) over mw.category_m_ip_us; not
# referenced by the functions defined in this cell.
domain_single_dql = """
select  domain_id, rank_est_usage_penetration, est_usage_penetration from plproxy.execute_select($proxy$
SELECT domain_id, rank_est_usage_penetration, est_usage_penetration
FROM mw.category_m_ip_us where 
date = '2020-09-30' 
and rank_est_usage_penetration <= 1000 
and category_id = {unified_category_id} $proxy$)
 t (domain_id bigint, rank_est_usage_penetration integer, est_usage_penetration double precision) order by rank_est_usage_penetration asc;
""" 



def get_plproxy_result(sql_str):
    """Run ``sql_str`` against PL/Proxy and return the rows as a Spark DataFrame.

    Args:
        sql_str: complete SQL statement whose first four result columns are
            app_id, rank, kpi and estimate, in that order.

    Returns:
        DataFrame with non-nullable columns app_id (long), rank (int),
        kpi (short) and estimate (double).

    Presumably ``query`` returns an iterable of indexable rows -- it is only
    iterated and indexed here; verify against db_check_utils.
    """
    result = query(PLPROXY_DSN, sql_str)

    # Plain positional tuples instead of Row(app_id=..., ...): on Spark < 3.0 a
    # keyword-built Row sorts its fields alphabetically, so pairing it with a
    # positional schema depends on version-specific name matching. Tuples line
    # up with the schema order below on every version.
    df_data = [(r[0], r[1], r[2], r[3]) for r in result]

    _schema = StructType([
        StructField("app_id", LongType(), False),
        StructField("rank", IntegerType(), False),
        StructField("kpi", ShortType(), False),
        StructField("estimate", DoubleType(), False),
    ])
    return spark.createDataFrame(data=df_data, schema=_schema)
    
def get_unified_data():
    """Register the distinct active domain ids of one weekly partition.

    Loads the delta data at the hard-coded mobileweb path as temp view
    ``test_unified`` and registers ``unified_df_new`` with the distinct
    domain_id values whose est_average_active_users is non-null and non-zero.
    """
    domain_unified_source_path = "s3://b2c-prod-data-pipeline-unified-mobileweb-paid/unified/mobileweb.basic.v4/fact/granularity=w/month=202005/date=2020-05-16"

    # The original loaded `unified_source_path`, a name that is never defined
    # (its only other occurrence is commented out), which raised NameError.
    spark.read.format("delta").load(domain_unified_source_path).createOrReplaceTempView("test_unified")
    spark.sql("select distinct domain_id from test_unified where  est_average_active_users <> 0 and est_average_active_users is not null order by domain_id asc").createOrReplaceTempView("unified_df_new")

def get_plproxy_data(sql_str=None):
    """Load PL/Proxy rows into ``plproxy_df_new`` and print summary queries.

    Args:
        sql_str: SQL to run through get_plproxy_result(). The original body
            called get_plproxy_result() with no argument, which always raised
            TypeError. When omitted, ``aggr_sql`` for the first entry of
            ``legacy_category_list`` is used.
            NOTE(review): that default is an assumption, chosen because the
            per-app average below suggests several dates -- confirm.

    Shows the number of distinct app ids, then the average estimate per app
    in descending order.
    """
    if sql_str is None:
        sql_str = aggr_sql.format(legacy_category_id=legacy_category_list[0])

    df_plproxy = get_plproxy_result(sql_str)
    df_plproxy.createOrReplaceTempView("plproxy_df_new")
    spark.sql("select count(distinct app_id) from plproxy_df_new").show(10, False)
    spark.sql("select app_id, avg(estimate) as estimate from plproxy_df_new group by app_id order by estimate desc").show(10000, False)



def compare_single():
    """Compare single-date top-1000 app sets: PL/Proxy vs. the unified view.

    For each id in ``legacy_category_list`` the PL/Proxy rows are registered
    as ``plproxy_df`` and the unified top 1000 (by est_usage_penetration) as
    ``apps_new``. The function then prints the category id and shows how many
    app ids exist on one side only, in both directions.

    NOTE(review): the basic_with_category_* views are built earlier in this
    notebook from rows filtered to 2020-01-01..2020-03-31, so the 2020-07-01
    filter below looks like it would match nothing -- confirm.
    """
    for legacy_id in legacy_category_list:
        plproxy_rows = get_plproxy_result(single_sql.format(legacy_category_id=legacy_id))
        plproxy_rows.createOrReplaceTempView("plproxy_df")

        # Unified side: translate the legacy id into its unified category key.
        unified_top_sql = """
            select product_key as app_id 
            from basic_with_category_daily 
            where date ='2020-07-01'
            and device_code = 'ios-phone' 
            and country_code='US' 
            and unified_category_key={unified_category} 
            order by est_usage_penetration desc limit 1000
            """.format(unified_category=category_mapping[legacy_id])
        spark.sql(unified_top_sql).createOrReplaceTempView("apps_new")

        # App ids present in PL/Proxy but missing from the unified result.
        only_in_plproxy_sql = """
            select app_id 
            from plproxy_df 
            except 
            select app_id 
            from apps_new"""
        spark.sql(only_in_plproxy_sql).createOrReplaceTempView("plproxy_diff_new")

        # App ids present in the unified result but missing from PL/Proxy.
        only_in_unified_sql = """
            select app_id 
            from apps_new 
            except 
            select app_id 
            from plproxy_df"""
        spark.sql(only_in_unified_sql).createOrReplaceTempView("new_diff_plproxy")

        print("category is {category}".format(category=legacy_id))

        count_queries = (
            """
            select count(1) as plproxy_diff_new 
            from plproxy_diff_new""",
            """
            select count(1) as new_diff_plproxy 
            from new_diff_plproxy""",
        )
        for count_sql in count_queries:
            spark.sql(count_sql).show(10, False)
# Ad-hoc Snowflake check, executed at cell run time between the function
# definitions: total est_referral_total_time_milliseconds per date.
from aadatapipelinecore.core.loader.snowflake import read

sql = """
    select date, sum(est_referral_total_time_milliseconds) as tt
    from "AA_INTELLIGENCE_PRODUCTION"."ADL_USAGE_PAID"."FACT_USAGE_DOMAIN_REFERRAL_V1_CLUSTER_BY_DATE_COUNTRY_CODE"
    group by date order by date
"""
# NOTE(review): Urn, atomic_id, ManipulationType and EventType are not imported
# anywhere in the visible notebook source; unless an unseen cell defines them,
# this statement raises NameError on a fresh kernel -- confirm and add imports.
urn = Urn(
    identifier=atomic_id(),
    manipulation=ManipulationType.QUERY,
    namespace="aa.usage.domain-referral.v6",
    event=EventType.LOAD,
    owner='app_tech'
)
# Column layout handed to read() for the query result.
# NOTE(review): the query aliases the sum as `tt` while the second field here
# is named `sf_tt` -- confirm whether read() maps columns by position or name.
storage_definition = {
    "database": "udw",
    "schema": "adl_usage_paid",
    "fields": [
        {"name": "date", "type": "date"},
        {"name": "sf_tt", "type": "double"},
    ]
}
# "usage qa" is passed positionally; its meaning is defined by read() (not
# visible here).
sf_df = read(urn, storage_definition, sql, "usage qa", spark)
sf_df.show(100, truncate=False)        

def compare_aggr():
    """Compare multi-date top-1000 app sets: PL/Proxy vs. the unified view.

    For each id in ``legacy_category_list``:
      * ``plproxy_aggr`` holds the top 1000 PL/Proxy apps by max(estimate);
      * ``apps_new`` holds the top 1000 unified apps by
        sum(est_active_users)/sum(est_population) over the two month-end dates;
      * the category id is printed, followed by the number of app ids found
        on one side only, in both directions.
    """
    for legacy_id in legacy_category_list:
        plproxy_rows = get_plproxy_result(aggr_sql.format(legacy_category_id=legacy_id))
        plproxy_rows.createOrReplaceTempView("plproxy_df")

        # Collapse the per-date PL/Proxy rows to one row per app.
        plproxy_top_sql = """
            select app_id,
                max(estimate) as estimate 
            from plproxy_df 
            group by app_id 
            order by estimate desc 
            limit 1000"""
        spark.sql(plproxy_top_sql).createOrReplaceTempView("plproxy_aggr")

        # Unified side: translate the legacy id into its unified category key.
        unified_top_sql = """
            select product_key as app_id, 
                sum(est_active_users)/sum(est_population) as aggr_up 
            from basic_with_category_monthly
            where date in  ('2020-01-31', '2020-03-31') 
            and device_code = 'ios-phone' 
            and country_code='US' 
            and unified_category_key={unified_category} 
            group by app_id 
            order by aggr_up desc 
            limit 1000""".format(unified_category=category_mapping[legacy_id])
        spark.sql(unified_top_sql).createOrReplaceTempView("apps_new")

        # Set differences in both directions, registered under fixed names.
        diff_views = (
            ("select app_id from plproxy_aggr except select app_id from apps_new", "plproxy_diff_new"),
            ("select app_id from apps_new except select app_id from plproxy_aggr", "new_diff_plproxy"),
        )
        for diff_sql, view_name in diff_views:
            spark.sql(diff_sql).createOrReplaceTempView(view_name)

        print("category is {category}".format(category=legacy_id))

        for count_sql in (
            "select count(1) as plproxy_diff_new from plproxy_diff_new",
            "select count(1) as new_diff_plproxy from new_diff_plproxy",
        ):
            spark.sql(count_sql).show(10, False)
        
    
# Cell driver: only the multi-date comparison runs; the single-date check and
# the ad-hoc partition preview below are left disabled.
# compare_single()   
compare_aggr()
# unified_source_path = "s3://b2c-prod-data-pipeline-unified-mobileweb-paid/unified/mobileweb.basic.v4/fact/granularity=w/date=2020-05-16"
# spark.read.format("delta").load(unified_source_path).show(10)


In [0]:

import pandas as pd
from pyspark.sql import functions as F
from datetime import datetime
from applications.db_check_v1.common.db_check_utils import query
from pyspark.sql import Row
from conf.settings import PG_USAGE_HOSTS, PG_USAGE_NAME, PG_USAGE_ACCESS_ID, PG_USAGE_SECRET_KEY, \
    CITUS_USAGE_NAME, CITUS_USAGE_ACCESS_ID, CITUS_USAGE_HOSTS, CITUS_USAGE_SECRET_KEY
from pyspark.sql.types import StructType, StructField, LongType, IntegerType, DoubleType, ShortType,FloatType

# Key/value connection string for the PL/Proxy usage database. Only the first
# entry of PG_USAGE_HOSTS is used; its two elements are read as (host, port).
PLPROXY_DSN = " ".join([
    "dbname='{db}'",
    "user='{user}'",
    "password='{password}'",
    "host='{host}'",
    "port='{port}'",
]).format(
    db=PG_USAGE_NAME,
    user=PG_USAGE_ACCESS_ID,
    password=PG_USAGE_SECRET_KEY,
    host=PG_USAGE_HOSTS[0][0],
    port=PG_USAGE_HOSTS[0][1],
)

# Legacy category ids are fully commented out in this cell; the compare_*
# functions below iterate unified_category_list instead.
# legacy_category_list = [36, 6014, 7012, 6004, 6008]
unified_category_list = [800000, 800001, 800005, 800035, 800031]
# legacy_category_list = [6014]
# unified_category_list = [800000]
# Legacy category id -> unified category key. The functions defined in this
# cell pass the unified key straight through and do not read this mapping.
category_mapping = {
    36: 800000,
    6014:800001,
    7012:800005,
    6004:800035,
    6008:800031
}

# App-side template (placeholder: {legacy_category_id}) over the monthly
# table for two dates; not referenced by the functions defined in this cell.
aggr_sql = """select  app_id, rank, kpi, estimate from plproxy.execute_select($proxy$
SELECT app_id, rank, kpi, estimate
FROM mu.category_monthly_2001_143441 where 
date in ('2020-08-31', '2020-09-30') 
and rank <= 1000 
and category_id = {legacy_category_id} $proxy$)
 t (app_id bigint, rank integer, kpi smallint, estimate double precision) order by rank asc;
"""

# Fixed query (category 6014 hard-coded, no placeholders) over the weekly
# table; not referenced by the functions defined in this cell.
aggr_sql_aa = """
SELECT app_id
    FROM plproxy.execute_select_nestloop($proxy$
        SELECT app_id, MAX(estimate) AS estimate
        FROM mu.category_weekly_2001_143441
        WHERE date BETWEEN '2020-08-21' AND '2020-08-30' AND category_id = 6014 AND rank <= 1000
        GROUP BY app_id
        ORDER BY MAX(estimate) DESC
        LIMIT 1000
    $proxy$) t (app_id BIGINT, estimate FLOAT8)
    GROUP BY app_id
    ORDER BY MAX(estimate) DESC, app_id ASC
    LIMIT 1000
"""

# App-side template (placeholder: {legacy_category_id}) over the monthly
# table for one date; not referenced by the functions defined in this cell.
single_sql =  """select  app_id, rank, kpi, estimate from plproxy.execute_select($proxy$
SELECT app_id, rank, kpi, estimate
FROM mu.category_monthly_2001_143441
where date = '2020-08-31'
and rank <= 1000 
and category_id = {legacy_category_id} $proxy$)
 t (app_id bigint, rank integer, kpi smallint, estimate double precision) order by rank asc;
"""

# Domain-side template (placeholder: {unified_category_id}): rows with
# rank_est_usage_penetration <= 1000 from mw.category_w_ip_us for one date;
# used by compare_single().
domain_single_dql = """
select  domain_id, rank_est_usage_penetration, est_usage_penetration from plproxy.execute_select($proxy$
SELECT domain_id, rank_est_usage_penetration, est_usage_penetration
FROM mw.category_w_ip_us where 
date = '2020-09-12' 
and rank_est_usage_penetration <= 1000 
and category_id = {unified_category_id} $proxy$)
 t (domain_id bigint, rank_est_usage_penetration integer, est_usage_penetration real) order by rank_est_usage_penetration asc;
""" 
# Domain-side template (placeholder: {unified_category_id}): same columns from
# mw.category_d_ip_us for two dates; used by compare_aggr().
domain_aggr_sql = """
select domain_id, rank_est_usage_penetration, est_usage_penetration from plproxy.execute_select($proxy$
SELECT domain_id, rank_est_usage_penetration, est_usage_penetration
FROM mw.category_d_ip_us where 
date in ('2020-09-01', '2020-09-02') 
and rank_est_usage_penetration <= 1000 
and category_id = {unified_category_id} $proxy$)
 t (domain_id bigint, rank_est_usage_penetration integer, est_usage_penetration real) order by rank_est_usage_penetration asc;
"""


def get_plproxy_result(sql_str):
    """Run ``sql_str`` against PL/Proxy and return the rows as a Spark DataFrame.

    Args:
        sql_str: complete SQL statement whose first three result columns are
            domain_id, rank_est_usage_penetration and est_usage_penetration,
            in that order.

    Returns:
        DataFrame with non-nullable columns domain_id (long),
        rank_est_usage_penetration (int) and est_usage_penetration (float).

    Presumably ``query`` returns an iterable of indexable rows -- it is only
    iterated and indexed here; verify against db_check_utils.
    """
    result = query(PLPROXY_DSN, sql_str)

    # Plain positional tuples instead of Row(domain_id=..., ...): on Spark < 3.0
    # a keyword-built Row sorts its fields alphabetically, which would put
    # est_usage_penetration ahead of rank_est_usage_penetration. Tuples line
    # up with the schema order below on every version.
    df_data = [(r[0], r[1], r[2]) for r in result]

    _schema = StructType([
        StructField("domain_id", LongType(), False),
        StructField("rank_est_usage_penetration", IntegerType(), False),
        StructField("est_usage_penetration", FloatType(), False),
    ])
    return spark.createDataFrame(data=df_data, schema=_schema)
    
def get_unified_data():
    """Register the distinct active domain ids of one weekly partition.

    Loads the delta data at the hard-coded mobileweb path as temp view
    ``test_unified`` and registers ``unified_df_new`` with the distinct
    domain_id values whose est_average_active_users is non-null and non-zero.
    """
    domain_unified_source_path = "s3://b2c-prod-data-pipeline-unified-mobileweb-paid/unified/mobileweb.basic.v4/fact/granularity=w/month=202005/date=2020-05-16"

    # The original loaded `unified_source_path`, a name that is never defined
    # (its only other occurrence is commented out), which raised NameError.
    spark.read.format("delta").load(domain_unified_source_path).createOrReplaceTempView("test_unified")
    spark.sql("select distinct domain_id from test_unified where  est_average_active_users <> 0 and est_average_active_users is not null order by domain_id asc").createOrReplaceTempView("unified_df_new")

# def get_plproxy_data():
#      df_plproxy=get_plproxy_result()
#      df_plproxy.createOrReplaceTempView("plproxy_df_new")
#      spark.sql("select count(distinct app_id) from plproxy_df_new").show(10, False)
#     #  spark.sql("select * from plproxy_df_new order by rank asc").show(10000, False)
     
#      spark.sql("select app_id, avg(estimate) as estimate from plproxy_df_new group by app_id order by estimate desc").show(10000, False)



def compare_single():
    """Compare single-date top-1000 id sets: PL/Proxy domains vs. the unified view.

    For each key in ``unified_category_list`` the PL/Proxy rows are registered
    as ``plproxy_df`` and the unified top 1000 (by est_usage_penetration) as
    ``apps_new`` with product_key exposed as domain_id. The function then
    prints the category key and shows how many ids exist on one side only, in
    both directions.

    NOTE(review): the basic_with_category_* views are built earlier in this
    notebook from rows filtered to 2020-01-01..2020-03-31, so the 2020-09-12
    filter below looks like it would match nothing -- confirm.
    """
    for unified_key in unified_category_list:
        plproxy_rows = get_plproxy_result(domain_single_dql.format(unified_category_id=unified_key))
        plproxy_rows.createOrReplaceTempView("plproxy_df")

        unified_top_sql = """
            select product_key as domain_id 
            from basic_with_category_weekly
            where date ='2020-09-12'
            and device_code = 'ios-phone' 
            and country_code='US' 
            and unified_category_key={unified_category} 
            order by est_usage_penetration desc limit 1000
            """.format(unified_category=unified_key)
        spark.sql(unified_top_sql).createOrReplaceTempView("apps_new")

        # Ids present in PL/Proxy but missing from the unified result.
        only_in_plproxy_sql = """
            select domain_id 
            from plproxy_df 
            except 
            select domain_id 
            from apps_new"""
        spark.sql(only_in_plproxy_sql).createOrReplaceTempView("plproxy_diff_new")

        # Ids present in the unified result but missing from PL/Proxy.
        only_in_unified_sql = """
            select domain_id 
            from apps_new 
            except 
            select domain_id 
            from plproxy_df"""
        spark.sql(only_in_unified_sql).createOrReplaceTempView("new_diff_plproxy")

        print("category is {category}".format(category=unified_key))

        count_queries = (
            """
            select count(1) as plproxy_diff_new 
            from plproxy_diff_new""",
            """
            select count(1) as new_diff_plproxy 
            from new_diff_plproxy""",
        )
        for count_sql in count_queries:
            spark.sql(count_sql).show(10, False)
        

def compare_aggr():
    """Per category, compare aggregated top-1000 domains from plproxy and the new daily data.

    For every id in ``unified_category_list``:

    * the plproxy result for ``domain_aggr_sql`` is registered as ``plproxy_df`` and
      reduced to ``plproxy_aggr``: one row per ``domain_id`` with the maximum
      ``est_usage_penetration``, top 1000 by that value;
    * ``apps_new`` is built from ``basic_with_category_daily`` for the dates
      2020-09-01 and 2020-09-02 (ios-phone, US): one row per product with the
      summed active users, summed population and their ratio, top 1000 by summed
      active users;
    * the ids present on only one side are computed with ``EXCEPT`` in both
      directions and their counts are shown.

    Returns nothing; the results are printed via ``DataFrame.show``.
    """
    # basic_with_category_{granularity}; .format(date='2020-08-31', legacy_category_id=36)

    for category in unified_category_list:
        # presumably a Spark DataFrame (it is registered as a temp view) -- confirm
        # against get_plproxy_result.
        df_plproxy = get_plproxy_result(domain_aggr_sql.format(unified_category_id=category))
        df_plproxy.createOrReplaceTempView("plproxy_df")
        spark.sql("""
            select domain_id,
                max(est_usage_penetration) as est_usage_penetration 
            from plproxy_df 
            group by domain_id 
            order by est_usage_penetration desc 
            limit 1000""").createOrReplaceTempView("plproxy_aggr")

        # sum_au has to be projected: the ORDER BY below refers to it, and without
        # the alias the name matches neither a select-list column nor a grouping
        # column.  sum_pop is exposed alongside it for ad-hoc inspection.
        spark.sql("""
            select product_key as domain_id, 
                sum(est_active_users) as sum_au,
                sum(est_population) as sum_pop,
                sum(est_active_users)/sum(est_population) AS aggre_up
            from basic_with_category_daily
            where date in  ('2020-09-01', '2020-09-02') 
            and device_code = 'ios-phone' 
            and country_code='US' 
            and unified_category_key={unified_category} 
            group by domain_id 
            order by sum_au desc 
            limit 1000"""
            .format(unified_category=category)).createOrReplaceTempView("apps_new")

        # One-sided differences between the two top-1000 id sets.
        spark.sql("select domain_id from plproxy_aggr except select domain_id from apps_new").createOrReplaceTempView("plproxy_diff_new")
        spark.sql("select domain_id from apps_new except select domain_id from plproxy_aggr").createOrReplaceTempView("new_diff_plproxy")
        print("category is {category}".format(category=category))
        # Debug helpers (uncomment to inspect the underlying rows):
        # spark.sql("select domain_id, sum_au,sum_pop from apps_new").show(1000, False)
        # spark.sql("select domain_id from plproxy_aggr").show(1000, False)
        # spark.sql("select domain_id from apps_new").show(1000, False)
        spark.sql("select count(1) as plproxy_diff_new from plproxy_diff_new").show(10, False)
        spark.sql("select count(1) as new_diff_plproxy from new_diff_plproxy").show(10, False)
        # spark.sql("select domain_id as plproxy_diff_new from plproxy_diff_new").show(10, False)
        # spark.sql("select domain_id as new_diff_plproxy from new_diff_plproxy").show(10, False)

    
# Driver for this cell: only the aggregated comparison is executed.
# Uncomment the next line to also run the single-date weekly comparison.
# compare_single()
compare_aggr()
# Disabled ad-hoc peek at a delta path (kept for reference):
# unified_source_path = "s3://b2c-prod-data-pipeline-unified-mobileweb-paid/unified/mobileweb.basic.v4/fact/granularity=w/date=2020-05-16"
# spark.read.format("delta").load(unified_source_path).show(10)

In [0]:

list_plproxy=[100,
118,
103,
114,
104,
110,
116,
115,
124,
111,
105,
544007664,
102,
122,
121,
284882215,
112,
101,
915056765,
454638411,
127,
585027354,
297606951,
447188370,
422689480,
389801252,
363590051,
324684580,
429047995,
333903271,
835599320,
284815942,
107,
117,
284035177,
113,
310633997,
535886823,
123,
295646461,
525463029,
119,
126,
338137227,
962194608,
109,
351727428,
323229106,
519817714,
842842640,
1446075923,
282614216,
951937596,
376510438,
364709193,
284910350,
368677368,
288429040,
298867247,
331177714,
384830320,
719972451,
284993459,
283646709,
1232058109,
336353151,
545519333,
317469184,
924620788,
407558537,
711923939,
297430070,
507874739,
1064216828,
1132762804,
985746746,
1069511734,
944011620,
282935706,
529379082,
414706506,
284847138,
530621395,
868077558,
431946152,
375380948,
462638897,
392796698,
335364882,
310738695,
510855668,
125,
311548709,
1069509450,
522826277,
640360962,
1110145103,
1174078549,
352683833,
416023011,
587366035,
909319292,
926252661,
500003565,
300048137,
302584613,
1058959277,
1462082664,
1287282214,
309172177,
842849113,
974748812,
1094591345,
379693831,
1196764367,
922103212,
281940292,
547702041,
586447913,
327630330,
460177396,
1096918571,
643496868,
361309726,
736179781,
1374403536,
470412147,
290638154,
367623543,
302920553,
570060128,
894546091,
342792525,
1207472156,
482066631,
106,
406719683,
341232718,
1017492454,
301724680,
919087726,
468996152,
322439990,
477128284,
304878510,
488818252,
998754894,
588013838,
338010821,
331786748,
293622097,
401626263,
553834731,
403901186,
986339882,
1261357853,
512393983,
680819774,
309735670,
479516143,
408709785,
1038369065,
912561374,
361304891,
1065781769,
382952264,
836767708,
307906541,
1113153706,
436491861,
305343404,
389781154,
930441707,
552602056,
490217893,
362348516,
464988855,
344542975,
386678211,
938003185,
879478102,
579581125,
480883488,
300704847,
1477841973,
1486214495,
535509415,
591560124,
973741088,
317951436,
342527639,
395545555,
472014516,
512939461,
971023427,
422663827,
1293634699,
529479190,
361285480,
388491656,
382698565,
336698281,
1176027022,
545599256,
444553167,
1223471316,
1384542200,
477537958,
886445756,
945416273,
410896080,
1053012308,
1482766542,
664939913,
967351793,
1495369374,
1486931468,
427916203,
546473125,
1493125671,
723134859,
983156458,
777875529,
878577184,
1453989822,
381471023,
1182474649,
357218860,
284862083,
1487968838,
1010729050,
485357017,
776010987,
1494449873,
1492451796,
1195621598,
286058814,
449945214,
1443446174,
731629156,
618783545,
1056813463,
621574163,
331763096,
443904275,
936971630,
382497397,
656971078,
312325565,
296581815,
1288723196,
436672029,
458734623,
284876795,
457954781,
559887125,
306310789,
448639966,
1407852246,
367003839,
1149449468,
1500010832,
405383140,
414012602,
314855255,
406889139,
571800810,
469284907,
321560858,
318698524,
309465525,
339532909,
469337564,
300238550,
382617920,
1193350206,
1487932387,
486154808,
904052407,
414113282,
349731802,
555376968,
1345968745,
1013231476,
1179915619,
543186831,
1423046460,
329913454,
1095459556,
414478124,
989804926,
896130944,
1477636326,
638323895,
1149994032,
307184892,
1490384223,
1087101090,
1105855019,
429610587,
1285713171,
901941015,
561930308,
368494609,
664575829,
308750436,
1452526406,
359917414,
1457853413,
1498901771,
1413942319,
341036067,
328412701,
1022831885,
1497873581,
418075935,
1491328118,
1076402606,
1481293953,
1128712763,
646100661,
1131203560,
509199715,
1389111413,
1480236789,
657189652,
328415391,
389638243,
502912815,
355554941,
307386350,
883324671,
844570015,
388627783,
850417475,
293523031,
1218465964,
940247939,
303113127,
549643634,
297368629,
1487720650,
521487551,
434893913,
663592361,
530168168,
393328150,
418987775,
456034437,
1475887491,
1496150467,
370811491,
651510680,
595287172,
336377331,
455004730,
749133753,
450722833,
348177453,
1498982970,
285692706,
1193508329,
1014164514,
1053285387,
947984433,
327228455,
491730359,
1440430680,
1479551182,
1367363988,
1490423596,
490179405,
442839435,
487285735,
314716233,
930574573,
1482395931,
1178765645,
1491206418,
711410889,
635150066,
867827909,
1205990992,
456282559,
411766326,
921765888,
1264782561,
1216396545,
1462556579,
399452287,
1344700142,
320029256,
1038653883,
330376830,
393048976,
521207075,
919745844,
331308914,
725097967,
986999874,
878783582,
352969997,
364376344,
1490632518,
890378044,
1474046667,
561311442,
593715088,
1046510029,
880047117,
288487321,
561625752,
1484932356,
467738064,
491126018,
1459289784,
1330123889,
324715238,
855627886,
284235722,
1111876388,
454607051,
493619333,
997362197,
955286870,
324613447,
1220976145,
527588389,
1181632589,
540518599,
1453411110,
589698942,
942608209,
1492913398,
880178264,
1080248000,
924373886,
925338276,
966758561,
1144258115,
514643583,
365399299,
487922291,
583446403,
284971959,
992600579,
1491764681,
1488895948,
542613198,
327962480,
888530356,
930565184,
1260755201,
1498438583,
814517475,
355833469,
331975235,
569077959,
1445450568,
724596345,
1218902777,
1212951043,
493145008,
364147881,
598329798,
407517450,
1495004134,
541933937,
692365393,
316800034,
1480638758,
502635374,
1478579926,
881599819,
426826309,
281796108,
1498229533,
583093664,
534659421,
517329357,
1344702806,
318142137,
366562751,
950424861,
357828853,
1442571346,
410395246,
1158877342,
1115101477,
498151501,
1456732568,
897446215,
586683407,
525818839,
868013618,
1420058690,
1476043340,
1010962391,
364901807,
504631398,
425199399,
320606217,
934030757,
1332100337,
443369807,
1494877874,
1331794412,
467329677,
876080126,
417571834,
1012014442,
1454061614,
405075943,
333647776,
1196434600,
435965836,
382107453,
1216575026,
1057889290,
439438619,
339597578,
1050443738,
1387897651,
836071680,
1488473004,
1095418609,
1043246998,
1499812410,
1229016807,
836215269,
335744614,
366869252,
1479587574,
547436543,
1435729435,
288113403,
589653414,
495369748,
535562583,
466122094,
1075603018,
335023774,
756972930,
465092669,
1031653653,
383298204,
1223932558,
1471527199,
438596432,
1208952944,
335393788,
886427730,
1473024868,
1118431695,
502147249,
298844386,
1452227871,
1456492106,
1495359905,
1375330146,
1115565187,
364191819,
947644950,
1112719759,
399072152,
319881193,
372836496,
1029207872,
1197354394,
395979574,
1424533120,
493390354,
495955880,
340021813,
398596699,
1027688889,
1485247734,
945077360,
628677149,
349554263,
469863705,
1288415553,
734547946,
1245772818,
409366411,
323701765,
345323231,
1444062497,
1451505313,
691797987,
1130498044,
918820076,
305204535,
294056623,
509993510,
320788270,
1116645064,
1176011642,
506627515,
572395608,
638689075,
1337578317,
1453651052,
316126557,
387771637,
698255242,
597077652,
874139669,
749083919,
952516687,
1099997174,
605841731,
429775439,
304158842,
399857015,
1008234539,
561941526,
1464235676,
1455330046,
398129933,
1145275343,
1452992954,
582790430,
1215933788,
1454398991,
398018310,
1165924249,
514485964,
589351740,
315010649,
1483239231,
1449014713,
645682444,
378351144,
1095519285,
1153883316,
568903335,
1478629374,
896501514,
543577420,
686449807,
1358499588,
1500516464,
474349412,
1091944550,
476718980,
375990038,
1138066420,
324906251,
1475440231,
1435807944,
687468969,
1485491919,
518684914,
356336433,
1206967173,
399355755,
1324738781,
1460772578,
1131342792,
995112030,
1451754591,
377672876,
645704840,
1114214294,
1095572547,
1245330908,
1278869953,
1487604820,
368217298,
1092689152,
1011935076,
922793622,
930565469,
1078789949,
804379658,
1217456898,
391093033,
1200318119,
834465911,
1086101495,
1016562846,
1017148055,
595012291,
1445691600,
319740707,
1474917623,
459182288,
1440756080,
356143077,
657527610,
291890420,
1039889567,
1465222322,
998411110,
1191435163,
1333256716,
1324604053,
313259740,
335186209,
362057947,
640111933,
664425773,
1212699939,
338701294,
471394851,
382233851,
1458680693,
1226926872,
727296976,
479280326,
299515267,
463630399,
364387007,
1017551780,
365886172,
1199564834,
1089225191,
376101648,
1475972538,
1332596741,
417970539,
586328581,
1194745615,
1170340094,
971265422,
1450244244,
355736423,
1415559267,
1485552989,
571096102,
1242998361,
510873505,
414461255,
1466588301,
469517819,
1071310449,
388838723,
1447924266,
723815926,
1500575377,
1478738631,
1357464684,
1234806557,
1496579180,
672417831,
410089731,
412050327,
1474598066,
1055281310,
1434400630,
712286167,
1447504309,
546821797,
550221096,
967040652,
1466332776,
580643740,
1448852425,
994905763,
353263352,
409838725,
995373885,
960098950,
1241229134,
1493269955,
458680449,
363920434,
1483571593,
1215301573,
1462762106,
600446812,
372648912,
634598719,
1286341761,
1277029359,
1467252438,
568047979,
425194759,
352509417,
700970012,
884009993,
1278474169,
287170072,
1016673544,
526641427,
1034035493,
1487204113,
714796093,
1022164656,
1494294554,
1480731374,
1479973551,
1233739175,
406033647,
640179084,
1065341814,
937253977,
314498713,
1024818709,
476394945,
542511686,
420009108,
424216726,
1237405816,
875063456,
911793120,
308331524,
1470373330,
481370590,
528413490,
1386235320,
1487212912,
407358186,
334256223,
911130812,
1031312446,
407108860,
498405045,
1469889140,
1124666597,
1193801909,
413936865,
1252497129,
1252850847,
448142450,
534524376,
1383187127,
430921891,
1490571442,
1227019728,
441599004,
1490398089,
287734809,
292987597,
1123102087,
1325756279,
307658513,
891132290,
1492277126,
898040123,
522167641,
1471967529,
358801284,
339883869,
364677107,
1207929487,
1471374791,
457876088,
639881495,
351331194,
579962583,
583009522,
1256027729,
1027352017,
630687854,
1222822904,
1483901554,
1485022388,
619460348,
1023782170,
1371565796,
1180884341,
1476681691,
1489425493,
970107419,
1449612799,
424716908,
1477804587,
1263934706,
372547556,
1109518523,
1492231362,
913943275,
468738585,
310730297,
971233157,
1022267439,
562794249,
336860594,
1494686273,
1274972321,
350189835,
376098999,
300235330,
1276551855,
1384542785,
491076730,
415458524,
1483295074,
1036661603,
1176428281,
1000017994,
1400579543,
403858572,
1470395927,
327370808,
1072898300,
438429273,
552764013,
1055502792,
1203077485,
305939712,
905869418,
1018368216,
597986893,
517914548,
1483027214,
495583717,
689501356,
709482991,
719525810,
1441648201,
874978299,
366247306,
390531553,
840919914,
1258654743,
1473951944,
1467316099,
1156192844,
514374715,
1280992496,
1064955217,
342792281,
420455839,
1454056744,
1454010667,
1413668881,
576649830,
1130616675,
1266591536,
404059113,
1462317543,
860822992,
529997671,
942571931,
1133785983,
620067490,
515094775,
1386566985,
1437783446,
418965252,
1340841589,
899247664,
466965151,
295076329,
1044456188,
1057968965,
448474183,
457446957,
1264546236,
1484825317,
392988420,
885982973,
358821736,
839285684,
1463186310,
589625334,
484672289,
1287596508,
1436132657,
887947640,
885271158,
499725337,
582654048,
399563465,
301987699,
286906691,
1498817833,
1089336971,
409625068,
397648381,
1488312219,
1323957120,
1491729217,
805603214,
878691772,
918263528,
1458609369,
1474992087,
1022579925,
1442061397,
489689106,
1465092216,
863844475,
1447871511,
1251584149,
1087187361,
710535379,
332510494,
693689912,
1099771240,
722804810,
530957474,
719219382,
792750948,
336381998,
1462818858,
555798051]

list_sf=[100,
118,
103,
114,
104,
110,
116,
115,
124,
111,
105,
102,
122,
544007664,
121,
112,
284882215,
101,
915056765,
454638411,
127,
585027354,
297606951,
447188370,
389801252,
422689480,
363590051,
324684580,
429047995,
107,
333903271,
284815942,
117,
284035177,
113,
835599320,
535886823,
310633997,
123,
295646461,
525463029,
126,
119,
962194608,
109,
338137227,
351727428,
323229106,
519817714,
282614216,
1446075923,
951937596,
842842640,
364709193,
376510438,
284910350,
288429040,
368677368,
384830320,
298867247,
284993459,
283646709,
1232058109,
331177714,
336353151,
407558537,
545519333,
719972451,
507874739,
711923939,
317469184,
1064216828,
1069511734,
297430070,
944011620,
284847138,
282935706,
414706506,
530621395,
985746746,
868077558,
529379082,
462638897,
335364882,
375380948,
125,
510855668,
1069509450,
1110145103,
311548709,
1174078549,
310738695,
392796698,
431946152,
416023011,
924620788,
352683833,
1132762804,
640360962,
909319292,
926252661,
300048137,
302584613,
522826277,
500003565,
1287282214,
309172177,
974748812,
1058959277,
587366035,
1094591345,
842849113,
379693831,
281940292,
922103212,
327630330,
547702041,
1462082664,
586447913,
361309726,
290638154,
1207472156,
470412147,
106,
342792525,
1374403536,
341232718,
301724680,
367623543,
894546091,
477128284,
1017492454,
482066631,
468996152,
1096918571,
588013838,
338010821,
460177396,
322439990,
488818252,
570060128,
406719683,
736179781,
919087726,
643496868,
302920553,
293622097,
553834731,
401626263,
986339882,
309735670,
1196764367,
304878510,
512393983,
361304891,
408709785,
331786748,
680819774,
1261357853,
403901186,
1038369065,
998754894,
479516143,
836767708,
305343404,
912561374,
490217893,
362348516,
930441707,
382952264,
464988855,
386678211,
344542975,
436491861,
307906541,
973741088,
971023427,
480883488,
512939461,
317951436,
472014516,
552602056,
300704847,
361285480,
879478102,
591560124,
342527639,
529479190,
395545555,
1477841973,
444553167,
388491656,
336698281,
1176027022,
535509415,
1482766542,
477537958,
945416273,
1293634699,
389781154,
938003185,
422663827,
546473125,
886445756,
382698565,
1053012308,
427916203,
410896080,
381471023,
1495369374,
967351793,
1494449873,
723134859,
1010729050,
485357017,
983156458,
731629156,
357218860,
284862083,
777875529,
1223471316,
776010987,
664939913,
331763096,
621574163,
312325565,
443904275,
1195621598,
1113153706,
296581815,
878577184,
936971630,
458734623,
579581125,
449945214,
1065781769,
559887125,
448639966,
618783545,
1288723196,
382497397,
1182474649,
367003839,
405383140,
1492451796,
1486214495,
1487968838,
656971078,
1453989822,
318698524,
284876795,
1056813463,
314855255,
286058814,
306310789,
469284907,
1407852246,
571800810,
321560858,
457954781,
486154808,
309465525,
349731802,
339532909,
904052407,
1193350206,
896130944,
414478124,
543186831,
1443446174,
436672029,
300238550,
1490384223,
382617920,
469337564,
1149449468,
1149994032,
638323895,
989804926,
1095459556,
1345968745,
1486931468,
1497873581,
1452526406,
368494609,
359917414,
1413942319,
1457853413,
545599256,
1423046460,
1477636326,
418075935,
307184892,
1105855019,
1384542200,
329913454,
341036067,
429610587,
308750436,
561930308,
646100661,
1022831885,
406889139,
414012602,
389638243,
355554941,
502912815,
657189652,
328412701,
307386350,
1389111413,
850417475,
883324671,
1087101090,
434893913,
1128712763,
1218465964,
509199715,
549643634,
663592361,
1491328118,
297368629,
1076402606,
303113127,
393328150,
293523031,
336377331,
450722833,
418987775,
1131203560,
1496150467,
1498982970,
456034437,
1500010832,
651510680,
370811491,
348177453,
455004730,
521487551,
595287172,
1482395931,
442839435,
1193508329,
530168168,
844570015,
491730359,
285692706,
711410889,
314716233,
749133753,
388627783,
1481293953,
947984433,
901941015,
1440430680,
555376968,
456282559,
1216396545,
320029256,
1493125671,
1487932387,
890378044,
1053285387,
635150066,
327228455,
867827909,
930574573,
399452287,
1491206418,
393048976,
940247939,
487285735,
330376830,
331308914,
1479551182,
1014164514,
878783582,
521207075,
1046510029,
1367363988,
725097967,
921765888,
1490632518,
1480236789,
1498901771,
986999874,
328415391,
1178765645,
1475887491,
1459289784,
288487321,
561625752,
1474046667,
593715088,
880047117,
561311442,
1264782561,
942608209,
467738064,
324715238,
411766326,
955286870,
414113282,
1490423596,
324613447,
491126018,
855627886,
364376344,
1111876388,
664575829,
1181632589,
1492913398,
514643583,
1488895948,
352969997,
493619333,
1330123889,
930565184,
925338276,
1498438583,
1285713171,
527588389,
583446403,
284235722,
454607051,
724596345,
966758561,
1220976145,
997362197,
487922291,
1495004134,
1344700142,
919745844,
589698942,
880178264,
331975235,
814517475,
1487720650,
540518599,
569077959,
327962480,
407517450,
355833469,
1038653883,
598329798,
490179405,
1260755201,
1462556579,
1080248000,
924373886,
542613198,
541933937,
493145008,
502635374,
992600579,
1218902777,
316800034,
517329357,
888530356,
881599819,
1205990992,
410395246,
281796108,
1332100337,
284971959,
1212951043,
364901807,
443369807,
357828853,
364147881,
320606217,
1453411110,
318142137,
897446215,
1144258115,
586683407,
1488473004,
1499812410,
1158877342,
1012014442,
366562751,
498151501,
934030757,
439438619,
425199399,
868013618,
504631398,
365399299,
950424861,
1179915619,
1115101477,
1216575026,
1095418609,
589653414,
1229016807,
288113403,
547436543,
1442571346,
1031653653,
836215269,
1010962391,
1484932356,
1480638758,
426826309,
1445450568,
382107453,
339597578,
333647776,
1435729435,
1050443738,
417571834,
1208952944,
1452227871,
1344702806,
1473024868,
495369748,
366869252,
335393788,
1043246998,
1118431695,
876080126,
1223932558,
383298204,
465092669,
1115565187,
1456732568,
1476043340,
1057889290,
335744614,
335023774,
495955880,
435965836,
1288415553,
583093664,
319881193,
535562583,
1424533120,
1478579926,
395979574,
493390354,
349554263,
1479587574,
340021813,
467329677,
398596699,
399072152,
886427730,
320788270,
1029207872,
438596432,
502147249,
691797987,
1130498044,
1451505313,
1245772818,
1331794412,
534659421,
1454061614,
1215933788,
1176011642,
345323231,
1471527199,
1165924249,
749083919,
1495359905,
1197354394,
305204535,
1008234539,
405075943,
734547946,
315010649,
514485964,
1116645064,
1456492106,
945077360,
1337578317,
1453651052,
1491764681,
364191819,
1478629374,
698255242,
398129933,
692365393,
1444062497,
628677149,
1500516464,
918820076,
1483239231,
378351144,
1112719759,
1475440231,
399857015,
316126557,
952516687,
896501514,
1027688889,
836071680,
1498229533,
429775439,
323701765,
1387897651,
1075603018,
1091944550,
645682444,
1013231476,
686449807,
568903335,
324906251,
509993510,
304158842,
1460772578,
399355755,
638689075,
874139669,
387771637,
543577420,
1095519285,
409366411,
1206967173,
922793622,
474349412,
597077652,
1196434600,
1131342792,
1200318119,
1153883316,
1420058690,
1217456898,
930565469,
1011935076,
1464235676,
1435807944,
368217298,
375990038,
466122094,
1099997174,
356336433,
391093033,
947644950,
1452992954,
1324738781,
1212699939,
995112030,
582790430,
1145275343,
362057947,
834465911,
479280326,
1138066420,
1095572547,
1114214294,
338701294,
1358499588,
471394851,
1449014713,
1485247734,
417970539,
727296976,
645704840,
506627515,
1092689152,
572395608,
1086101495,
657527610,
998411110,
355736423,
463630399,
388838723,
518684914,
1500575377,
1245330908,
1089225191,
398018310,
1415559267,
356143077,
525818839,
377672876,
294056623,
664425773,
971265422,
1055281310,
640111933,
469517819,
299515267,
589351740,
1017148055,
476718980,
510873505,
605841731,
364387007,
995373885,
1194745615,
365886172,
335186209,
459182288,
687468969,
469863705,
458680449,
1451754591,
1226926872,
804379658,
1016562846,
672417831,
595012291,
313259740,
414461255,
1475972538,
561941526,
1333256716,
1474917623,
412050327,
352509417,
700970012,
363920434,
287170072,
1078789949,
372836496,
634598719,
600446812,
1332596741,
314498713,
526641427,
1065341814,
1465222322,
372648912,
960098950,
1022164656,
1039889567,
382233851,
568047979,
1034035493,
1241229134,
1455330046,
884009993,
546821797,
640179084,
1480731374,
712286167,
353263352,
406033647,
1487604820,
550221096,
723815926,
528413490,
1490398089,
1434400630,
1278869953,
580643740,
1454398991,
1445691600,
1191435163,
319740707,
1017551780,
287734809,
448142450,
937253977,
407108860,
1447504309,
1207929487,
441599004,
476394945,
1485552989,
714796093,
481370590,
430921891,
1494877874,
1215301573,
498405045,
967040652,
891132290,
1469889140,
911793120,
1278474169,
1483901554,
1227019728,
1199564834,
1386235320,
1448852425,
1493269955,
308331524,
911130812,
1237405816,
424216726,
994905763,
579962583,
1222822904,
1124666597,
1016673544,
534524376,
364677107,
1470373330,
1357464684,
1233739175,
1467252438,
1123102087,
358801284,
1031312446,
420009108,
372547556,
522167641,
413936865,
407358186,
1474598066,
1109518523,
1440756080,
1071310449,
1471374791,
350189835,
571096102,
1487204113,
1383187127,
298844386,
468738585,
1027352017,
310730297,
1477804587,
875063456,
1242998361,
292987597,
905869418,
351331194,
339883869,
291890420,
376101648,
410089731,
1022267439,
415458524,
619460348,
1252850847,
1203077485,
305939712,
336860594,
1471967529,
403858572,
1466588301,
1277029359,
542511686,
1492231362,
366247306,
719525810,
1449612799,
840919914,
913943275,
562794249,
1371565796,
1193801909,
1324604053,
1023782170,
971233157,
517914548,
424716908,
491076730,
874978299,
1000017994,
689501356,
327370808,
1375330146,
409838725,
1280992496,
898040123,
1180884341,
1256027729,
583009522,
1485491919,
1447924266,
639881495,
709482991,
425194759,
334256223,
1458680693,
438429273,
300235330,
418965252,
1176428281,
1263934706,
376098999,
1072898300,
457446957,
495583717,
1276551855,
1498817833,
1462762106,
342792281,
1055502792,
404059113,
576649830,
420455839,
1478738631,
885271158,
942571931,
1156192844,
392988420,
1454010667,
1496579180,
1400579543,
1170340094,
484672289,
515094775,
1266591536,
390531553,
970107419,
457876088,
307658513,
358821736,
1467316099,
1437783446,
1036661603,
1463186310,
1494294554,
1462818858,
514374715,
301987699,
1458609369,
1485022388,
399563465,
589625334,
630687854,
1413668881,
1133785983,
1436132657,
899247664,
620067490,
1064955217,
597986893,
1484825317,
805603214,
1447871511,
1089336971,
1258654743,
885982973,
286906691,
1054033219,
1130616675,
409625068,
1274972321,
1057968965,
499725337,
1087187361,
860822992,
887947640,
555798051,
529997671,
1219889310,
756972930,
1384542785,
1136238277,
1466332776,
863844475,
306621789,
621469412,
878691772,
1168348542,
449141888,
530957474,
700740807,
1386566985,
466965151,
1363863879,
1462317543,
1044456188,
336381998,
485126024,
918263528,
1358222641,
1470395927,
332510494,
793039965,
290051590,
1466582065,
489689106,
839285684,
1047246341,
1346179411,
561350520,
582654048,
1494267894,
1490571442,
1052879175,
1261483635,
578830929,
415894489,
295076329,
1024818709,
397648381,
1251584149,
896112560,
971585438,
1287596508,
1153443320,
388082488,
1483571593,
596402997,
909351158,
1500610643,
1340841589,
1488581193,
337472899,
1085652055,
445553058,
326347260]

# Compare the two hard-coded id lists as sets and report what each side is missing.

# ids that appear in list_plproxy but not in list_sf
in_pl_not_in_sf = list(set(list_plproxy) - set(list_sf))
diff_num1 = len(in_pl_not_in_sf)

# ids that appear in list_sf but not in list_plproxy
in_sf_not_in_pl = list(set(list_sf) - set(list_plproxy))
diff_num2 = len(in_sf_not_in_pl)

# Same direction as in_sf_not_in_pl (sf minus plproxy); the reverse direction
# would be list(set(list_plproxy) - set(list_sf)).
diff_list = list(set(list_sf) - set(list_plproxy))

# Output order: count(plproxy only), count(sf only), then the sf-only ids.
print(diff_num1)
print(diff_num2)
print(diff_list)