In [0]:


import pandas as pd
from pyspark.sql import functions as F
from datetime import datetime
from applications.db_check_v1.common.db_check_utils import query
from conf.settings import PG_USAGE_HOSTS, PG_USAGE_NAME, PG_USAGE_ACCESS_ID, PG_USAGE_SECRET_KEY, \
    CITUS_USAGE_NAME, CITUS_USAGE_ACCESS_ID, CITUS_USAGE_HOSTS, CITUS_USAGE_SECRET_KEY

# Keyword/value connection string handed to query() for the usage PL/Proxy
# database. Host and port come from the first entry of PG_USAGE_HOSTS; the
# database name and credentials come from conf.settings.
PLPROXY_DSN = "dbname='{db}' user='{user}' password='{password}' host='{host}' port='{port}'".format(
    host=PG_USAGE_HOSTS[0][0],
    port=PG_USAGE_HOSTS[0][1],
    db=PG_USAGE_NAME,
    user=PG_USAGE_ACCESS_ID,
    password=PG_USAGE_SECRET_KEY,
)

def get_date_list(begin_date, end_date, freq):
    """Return every date between two bounds as 'YYYY-MM-DD' strings.

    Args:
        begin_date: start of the range, passed to ``pd.date_range(start=...)``.
        end_date: end of the range (inclusive), passed to ``pd.date_range(end=...)``.
        freq: pandas frequency alias selecting which dates are generated
            (e.g. "D", "W-SAT", "M").

    Returns:
        list of str: the generated dates in ascending order; empty when the
        range contains no date matching ``freq``.
    """
    formatted = []
    for stamp in pd.date_range(start=begin_date, end=end_date, freq=freq):
        formatted.append(stamp.strftime('%Y-%m-%d'))
    return formatted


# Overall window for which candidate check dates are generated.
begin_date = datetime(2010, 1, 1)
end_date = datetime(2020, 8, 31)

# Granularity name -> list of 'YYYY-MM-DD' dates inside the window, one list per
# pandas frequency alias ("D" daily, "W-SAT" weekly on Saturdays, "M" month end).
DATE_GRANULARITY_MAPPINGLIST = {
    granularity: get_date_list(begin_date, end_date, alias)
    for granularity, alias in (("daily", "D"), ("weekly", "W-SAT"), ("monthly", "M"))
}

# Monthly dates are kept newest-first.
DATE_GRANULARITY_MAPPINGLIST["monthly"] = DATE_GRANULARITY_MAPPINGLIST["monthly"][::-1]

#print(DATE_GRANULARITY_MAPPINGLIST["weekly"])

# Per-date check query for ca.app_monthly. The inner statement (between the
# $proxy$ markers) groups rows for the given date by kpi and computes
# count(estimate) and sum(estimate); the outer statement sums those partial
# results per kpi and returns them ordered by kpi descending.
# The {date} placeholder is filled in with str.format().
# NOTE(review): presumably plproxy.execute_select_nestloop runs the inner
# statement on every shard and returns one partial row set per shard - confirm.
sql_template_a_x_a = """
select kpi, sum(count_a) as app_count, sum(sum_e) as sum_e from plproxy.execute_select_nestloop($proxy$ 
    select kpi, count(estimate) as count_a, sum(estimate) as sum_e
    from ca.app_monthly
    where 
        date ='{date}'
    group by kpi order by kpi desc
$proxy$) tpl (kpi smallint, count_a bigint, sum_e double precision ) group by kpi order by kpi desc ;
"""

# Per-date check query for a cross table mw.<db_name>_m. The inner statement
# computes the row count and the sums of est_cross_product_affinity and
# est_cross_product_usage_penetration for the given date; the outer statement
# adds the partial results together into a single row
# (count, affinity_sum, up_sum).
# The {db_name} and {date} placeholders are filled in with str.format().
# NOTE(review): the partial sums are declared as `real` in the tpl column list
# while the app query above uses `double precision` - confirm this precision is
# intended, given the results are later printed with 20 significant digits.
sql_template = """
select sum(count) as count, sum(affinity_sum) as affinity_sum,  sum(up_sum) as up_sum from plproxy.execute_select_nestloop($proxy$ 
    select count(*) as count, sum(est_cross_product_affinity) as affinity_sum,  sum(est_cross_product_usage_penetration) as up_sum
    from mw.{db_name}_m
    where 
        date ='{date}'
$proxy$) tpl (count bigint, affinity_sum real,up_sum real ) ;

"""
# Restrict the check to the months under investigation; this replaces whatever
# monthly list was generated earlier in the cell.
DATE_GRANULARITY_MAPPINGLIST["monthly"] = ["2020-03-31","2020-04-30","2020-05-31","2020-06-30","2020-07-31"]

# For every month, print one CSV-style line "date,count,up_sum,affinity_sum"
# combining the app x app totals with the totals of each cross table listed
# below. A failure for one month is reported and the loop moves on to the next.
for date_str in DATE_GRANULARITY_MAPPINGLIST["monthly"]:
    try:
        # app x app: one row per kpi, ordered by kpi descending.
        # NOTE(review): assumes the first row carries the affinity kpi and the
        # second row the usage-penetration kpi, and that the app count of the
        # first row is the one to report - confirm against the kpi codes.
        result = query(PLPROXY_DSN, sql_template_a_x_a.format(date=date_str))
        count = result[0][1]
        affinity_sum = result[0][2]
        up_sum = result[1][2]
        # Cross tables stored as mw.<db_name>_m; only domain_x_domain is
        # enabled here. Add further table prefixes to the list to include them.
        for db_name in ["domain_x_domain"]:
            result = query(PLPROXY_DSN, sql_template.format(date=date_str, db_name=db_name))
            count += result[0][0]
            affinity_sum += result[0][1]
            up_sum += result[0][2]
            print("{},{}".format(db_name, result[0]))
        print("{},{},{:.20g},{:.20g}".format(date_str, count, up_sum, affinity_sum))
    except Exception as e:
        # Keep going with the remaining months, but surface why this one failed
        # (missing rows -> IndexError, NULL sums -> TypeError, connection
        # problems, ...) instead of printing a bare "ERROR".
        print("{},{},{!r}".format(date_str, "ERROR", e))



In [0]:
%%sh

# The database password must NOT be stored in the notebook. Export PGPASSWORD in
# the kernel environment before running this cell; psql picks it up from there.
# NOTE(review): the password that used to be embedded on this line should be
# treated as leaked and rotated.
: "${PGPASSWORD:?PGPASSWORD must be set in the environment before running this cell}"
export PGPASSWORD

psql -h internal-aa-prod-usage-plproxy-internal-1640809782.us-east-1.elb.amazonaws.com -U app_bdp_usage_qa -d cohort -p 7432 << EOF

-- adhoc

-- total: estimated row count taken from the planner statistics (pg_class.reltuples).
-- 'table_name' and 'schema' are placeholders; replace them with the relation to inspect.
select
	reltuples::int as total
from
	pg_class
where
	relname = 'table_name'
	and relnamespace = (select oid from pg_namespace where nspname = 'schema');


-- row count per device_code in mw.domain_x_domain_m for 2020-05-31, summed over
-- the partial counts returned by plproxy.execute_select
select device_code, sum(count_a) from plproxy.execute_select(\$proxy\$
    SELECT device_code, count(*) as count_a FROM mw.domain_x_domain_m where date = '2020-05-31' group by device_code \$proxy\$)
 t ( device_code char(2),count_a bigint) group by device_code ;

EOF


In [0]:



# Alternative input kept for reference (unified cross-product v6, app x app, 2020-06-30):
# spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-usage/unified/usage.cross-product.v6/fact/cross_type=app_cross_app/granularity=monthly/date=2020-06-30/").createOrReplaceTempView("cross")

# Register the legacy cross-domain Delta partition for month 202005 as the
# temp view `cross_m_m_legacy`.
(
    spark.read
    .format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-mobileweb-paid/unified/mobileweb.cross-domain.v4/fact/granularity=m/month=202005/")
    .createOrReplaceTempView("cross_m_m_legacy")
)

# Total number of rows in that partition.
spark.sql("select count(*) from cross_m_m_legacy").show(20)
