In [0]:

# Register the zipped project dependencies with the Spark context.
# NOTE(review): presumably this is what makes the `applications.*` imports below resolvable — confirm.
spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import pandas as pd
# Print wide DataFrames on one line instead of wrapping their columns across several lines.
pd.set_option('expand_frame_repr', False)
import datetime
from applications.db_check_v1.common.base_test import PipelineTest
from applications.db_check_v1.common.db_check_utils import query_df, etl_skip
from applications.db_check_v1.common.constants import COUNTRY_CODE_MAPPING_BY_MARKET_CODE as COUNTRY_CODE_MAPPING, \
    CATEGORY_ID_MAPPING_BY_MARLKET_AND_DEVICE_CODE as CATEGORY_ID_MAPPING
from applications.db_check_v1.common.utils import get_week_start_end_date, get_date_list
#from applications.db_check_v1.cases.store.publisher_est_v1.constants import MARKET_SIZE_DSN

def get_date_list(begin_date, end_date, freq="D"):
    """Return the dates between two bounds as 'YYYY-MM-DD' strings.

    NOTE(review): `get_date_list` is imported again from
    `applications.db_check_v1.common.utils` further down in this same cell,
    which rebinds the name after this definition runs — confirm which
    implementation is meant to be used.

    :param begin_date: first date of the range (anything `pd.date_range` accepts as `start`)
    :param end_date: last date of the range, inclusive (anything `pd.date_range` accepts as `end`)
    :param freq: pandas frequency string; defaults to "D" (one entry per calendar day)
        so that two-argument calls such as `get_date_list(start, end)` keep working
    :return: list of date strings formatted as '%Y-%m-%d'
    """
    return [stamp.strftime('%Y-%m-%d')
            for stamp in pd.date_range(start=begin_date, end=end_date, freq=freq)]

import boto3
# boto3 resource-level S3 client.
s3 = boto3.resource('s3')
# Handle to the regression log object; the same bucket/key is downloaded by the
# `aws s3 cp` cell near the end of the notebook.
s3object = s3.Object('b2c-prod-data-pipeline-qa', 'tom/top_publisher/regression_plproxy.txt')

def write_log(strobj, s3obj):
    """Upload the string form of `strobj` as the body of `s3obj`.

    :param strobj: any object; it is converted with `str()` before upload
    :param s3obj: object exposing `put(Body=...)` (e.g. a boto3 S3 Object)
    """
    payload = str(strobj)
    s3obj.put(Body=payload)


# Copyright (c) 2018 App Annie Inc. All rights reserved.
# pylint: disable=E1101,C0412,C1801,C0201

"""
DB Check modules
"""

import datetime

from applications.db_check_v1.common.base_test import PipelineTest
from applications.db_check_v1.common.db_check_utils import query_df, etl_skip
from applications.db_check_v1.common.constants import COUNTRY_CODE_MAPPING_BY_MARKET_CODE as COUNTRY_CODE_MAPPING, \
    CATEGORY_ID_MAPPING_BY_MARLKET_AND_DEVICE_CODE as CATEGORY_ID_MAPPING
from applications.db_check_v1.common.utils import get_week_start_end_date, get_date_list
from applications.db_check_v1.cases.usage.basic_kpi_v3.test_basic_kpi_v3_routine_plproxy import CITUS_DSN as PUB_EST_DSN
from applications.db_check_v1.cases.store.app_est_publisher_v1.constants import PUB_EST_DSN, PUB_EST_DB_METRICS


class PublisherEstRawData(object):
    """Reads the raw publisher-estimate files from S3 through Spark.

    Offers two kinds of access:
      * `get` loads one date/country, maps raw ids to unified codes and pivots
        the feeds into metric columns (pandas DataFrame).
      * `get_metrics_count` / `get_v1_raw_metrics_count` count raw rows for a date.
    """
    # Path template: the first placeholder is the upper-cased stage, the second is the date.
    raw_s3_path = "s3://b2c-prod-dca-store-estimates/store_estv2/PUB_ESTIMATES_{}/version=2.0.0/range_type=DAY" \
                  "/date={}/"
    # Keys are str(platform_id) + str(feed), as built in _parse_mapping.
    device_code_mapping = {
        "00": "android-all",
        "01": "android-all",
        "02": "android-all",
        "10": "ios-phone",
        "11": "ios-phone",
        "12": "ios-phone",
        "1100": "ios-tablet",
        "1101": "ios-tablet",
        "1102": "ios-tablet",
        # "21000": "ios-all",
        # "21001": "ios-all",
        # "21002": "ios-all",
    }

    # Raw feed id -> metric column name produced by the pivot in _parse_unified_format.
    metric_mapping = {
        0: "free_app_download",
        1: "paid_app_download",
        2: "revenue",
        101: "free_app_download",
        100: "paid_app_download",
        102: "revenue",
        # 1000: "free_app_download",
        # 1001: "paid_app_download",
        # 1002: "revenue"
    }

    # Raw column name -> unified column name.
    dimension_mapping = {
        "id": "publisher_id",
    }

    def __init__(self, spark):
        """
        :param spark: Spark session used for every read performed by this object
        """
        self.spark = spark

    def get(self, date, country_code, stage="final"):
        """Load one date/country of raw data and convert it to the unified layout.

        :param date: date string substituted into the `date=` part of the S3 path
        :param country_code: country code looked up in COUNTRY_CODE_MAPPING
        :param stage: stage name; upper-cased into the `PUB_ESTIMATES_<STAGE>` path segment
        :return: pandas DataFrame with one row per publisher/category/device/country
        :raises ValueError: if `country_code` maps to no store id
        """
        df = self._get_raw_data_by_date_country(date, country_code, stage)
        df = self._parse_mapping(df)
        df = self._parse_unified_format(df)
        df = self._data_clean_up(df)
        return df

    def _data_clean_up(self, df):
        """Keep only rows whose category_id and country_code are known mapped values."""
        # Set union instead of `values() + values()`: the latter only works when
        # dict.values() returns a list (Python 2).
        category_id_list = list(
            set(CATEGORY_ID_MAPPING["apple-store"]["ios-all"].values()) |
            set(CATEGORY_ID_MAPPING["google-play"]["android-all"].values()))

        country_code_list = list(
            set(COUNTRY_CODE_MAPPING["apple-store"].values()) |
            set(COUNTRY_CODE_MAPPING["google-play"].values()))

        df = df[(df['category_id'].isin(category_id_list)) & (df['country_code'].isin(country_code_list))]
        return df

    def _parse_mapping(self, df):
        """Translate raw ids into unified codes.

        platform_id 0 rows use the "google-play" mappings, platform_id 1 rows the
        "apple-store" mappings. The first two assignments modify the passed frame in place.
        """
        # country_code mapping
        df.loc[df["platform_id"] == 0] = df.loc[df["platform_id"] == 0].\
            replace({"store_id": COUNTRY_CODE_MAPPING["google-play"]})
        df.loc[df["platform_id"] == 1] = df.loc[df["platform_id"] == 1].\
            replace({"store_id": COUNTRY_CODE_MAPPING["apple-store"]})
        df = df.rename(columns={'store_id': 'country_code'})

        # category_id mapping
        df.loc[df["platform_id"] == 0] = df.loc[df["platform_id"] == 0].\
            replace({"category_id": CATEGORY_ID_MAPPING["google-play"]["android-all"]})
        df.loc[df["platform_id"] == 1] = df.loc[df["platform_id"] == 1].\
            replace({"category_id": CATEGORY_ID_MAPPING["apple-store"]["ios-all"]})

        # device_code mapping: must happen before `feed` is replaced by metric names below,
        # because the lookup key is built from the numeric feed id.
        df["device_code"] = df["platform_id"].astype(str) + df["feed"].astype(str)
        df = df.replace({"device_code": self.device_code_mapping})

        # granularity
        df["granularity"] = "daily"

        # metrics mapping (from feed)
        df = df.replace({"feed": self.metric_mapping})
        return df

    def _parse_unified_format(self, df):
        """Pivot the `feed` values into columns holding `est`, one row per dimension tuple."""
        df = df.rename(columns=self.dimension_mapping)
        df = df.pivot_table(index=["publisher_id", "category_id", "device_code", "country_code", "granularity"],
                            columns='feed', values='est')
        df.reset_index(inplace=True)
        df.columns.name = None
        return df

    def _get_raw_data_by_date_country(self, date, country_code, stage):
        """
        Read the raw parquet for `stage`/`date`, restricted to the store ids of `country_code`.

        Sample of the raw layout:
        +----------+--------+-----------+-----------+--------+----+----+-----+--------+
        |        id|store_id|category_id|platform_id|vertical|rank|feed|  est|platform|
        +----------+--------+-----------+-----------+--------+----+----+-----+--------+
        | 284417353|       0|       6006|          1|       1|   1|1002|45235|     ios|
        | 349554266|       0|       6006|          1|       1|   2|1002|20732|     ios|
        |1316153435|       0|       6006|          1|       1|   3|1002|15136|     ios|
        +----------+--------+-----------+-----------+--------+----+----+-----+--------+

        :return: pandas DataFrame of the matching raw rows
        :raises ValueError: if `country_code` maps to no store id
        """
        ios_store_ids = [str(k) for k, v in COUNTRY_CODE_MAPPING["apple-store"].items() if v == country_code]
        gp_store_ids = [str(k) for k, v in COUNTRY_CODE_MAPPING["google-play"].items() if v == country_code]
        store_ids = ios_store_ids + gp_store_ids
        if not store_ids:
            # Without this guard the filter below would be built with an empty `in ()` list.
            raise ValueError("no store id is mapped to country_code {!r}".format(country_code))

        # The template expects (stage, date) — same argument order as get_metrics_count.
        raw_path = self.raw_s3_path.format(stage.upper(), date)
        raw_df = self.spark.read.parquet(raw_path).\
            filter('store_id in ({})'.format(",".join(store_ids))).toPandas()
        return raw_df

    def get_metrics_count(self, date, stage="final"):
        """Count raw rows for `date` whose platform, store id and feed are all mapped.

        :param date: date string substituted into the `date=` part of the S3 path
        :param stage: stage name; upper-cased into the `PUB_ESTIMATES_<STAGE>` path segment
        :return: iOS (platform_id = 1) row count plus Google Play (platform_id = 0) row count
        """
        ios_store_id = [str(s_id) for s_id in COUNTRY_CODE_MAPPING["apple-store"].keys()]
        gp_store_id = [str(s_id) for s_id in COUNTRY_CODE_MAPPING["google-play"].keys()]
        # ios_category_id = [str(c_id) for c_id in CATEGORY_ID_MAPPING["apple-store"]["ios-all"].keys()]
        # gp_category_id = [str(c_id) for c_id in CATEGORY_ID_MAPPING["google-play"]["android-all"].keys()]

        filter_sql = "platform_id = {} and store_id in ({}) and feed in ({})"
        df = self.spark.read.parquet(self.raw_s3_path.format(stage.upper(), date))
        feed_ids_sql = ",".join([str(x) for x in self.metric_mapping.keys()])

        count_all = df.filter(filter_sql.format(1, ",".join(ios_store_id), feed_ids_sql)).count() + \
            df.filter(filter_sql.format(0, ",".join(gp_store_id), feed_ids_sql)).count()
        return count_all

    def get_v1_raw_metrics_count(self, date):
        """Count rows of the v1 tab-separated raw files for `date` whose store id is mapped.

        :param date: date string substituted into the v1 S3 path
        :return: iOS row count plus Google Play row count
        """
        ios_store_id = [str(s_id) for s_id in COUNTRY_CODE_MAPPING["apple-store"].keys()]
        gp_store_id = [str(s_id) for s_id in COUNTRY_CODE_MAPPING["google-play"].keys()]

        df = self.spark.read.option("delimiter", "\t").csv(
            "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/{}/*/sbe_est_publisher/*/*.csv.gz".format(date))
        filter_sql = "_c2 = {} and _c0 in ({})"  # _c2 > platform_id, _c0 > store_id

        count_all = df.filter(filter_sql.format(1, ",".join(ios_store_id))).count() + \
            df.filter(filter_sql.format(0, ",".join(gp_store_id))).count()
        return count_all


class PublisherEstUnifiedData(object):
    """Reads the unified publisher-estimate delta table from S3 through Spark."""
    # Path template; the placeholder is the date partition.
    unified_s3_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-publisher-dna-log.v1/" \
                      "fact/granularity=daily/date={}/"
    # Device codes taken into account by get_metrics_count.
    available_device_codes = ['ios-phone', 'ios-tablet', 'android-all']

    def __init__(self, spark):
        """
        :param spark: Spark session used for every read performed by this object
        """
        self.spark = spark

    def _load_date(self, date):
        """Open the delta partition for `date` as a Spark DataFrame."""
        location = self.unified_s3_path.format(date)
        return self.spark.read.format("delta").load(location)

    def get(self, date, country_code):
        """Return the rows of one date/country as a pandas DataFrame.

        The `_identifier`, `revenue_iap`, `revenue_non_iap` and `date` columns are dropped.
        """
        country_filter = "country_code = '{}'".format(country_code)
        frame = self._load_date(date).filter(country_filter).toPandas()
        dropped_columns = ["_identifier", "revenue_iap", "revenue_non_iap", "date"]
        return frame.drop(dropped_columns, axis=1)

    def get_metrics_count(self, date):
        """Sum, over every metric in PUB_EST_DB_METRICS, the rows where that metric is not null.

        Only rows whose device_code is in `available_device_codes` are counted.
        """
        frame = self._load_date(date)
        device_codes_sql = "','".join(self.available_device_codes)
        return sum(
            frame.filter("device_code in ('{}') and {} is not null".format(device_codes_sql, metric_name)).count()
            for metric_name in PUB_EST_DB_METRICS
        )


class PublisherEstDBData(object):
    """Reads publisher estimates from `store.store_est_publisher_fact_v2` via `query_df`."""

    def get(self, date):
        """Return every row stored for `date`, as returned by `query_df`."""
        select_all = "SELECT * FROM store.store_est_publisher_fact_v2 WHERE date='{}'".format(date)
        return query_df(PUB_EST_DSN, select_all)

    def get_metrics_count(self, date):
        """Sum, over every metric in PUB_EST_DB_METRICS, the rows of `date` where it is not null.

        One count query is issued per metric.
        """
        total = 0
        for metric_name in PUB_EST_DB_METRICS:
            count_sql = "SELECT count(*) AS metrics_count FROM store.store_est_publisher_fact_v2 " \
                        "WHERE date='{}' AND {} IS NOT NULL".format(date, metric_name)
            first_row = query_df(PUB_EST_DSN, count_sql).loc[0]
            total += first_row.metrics_count
        return total


class TestPublisherEstFinalWeekly(PipelineTest):
    """Weekly checks of the final publisher estimates: raw vs unified vs DB row counts."""
    # Every Tuesday 15:00 UTC(23:00 BJ) time will refresh the data of last Full Week.
    trigger_date_config = ('* 15 * * 2', 3)

    @etl_skip()
    def test_publisher_est_final_weekly_etl_completeness(self):
        """For each day of the checked week the three sources must report the same non-zero count."""
        week_start, week_end = get_week_start_end_date(self.check_date_str)
        for day in get_date_list(week_start, week_end):
            raw_total = PublisherEstRawData(self.spark).get_metrics_count(day, "final")
            unified_total = PublisherEstUnifiedData(self.spark).get_metrics_count(day)
            db_total = PublisherEstDBData().get_metrics_count(day)

            unified_msg = 'raw count:{}, unified count:{}, date:{}'.format(raw_total, unified_total, day)
            self.assertEqual(raw_total, unified_total, unified_msg)

            db_msg = 'raw count:{}, db count:{}, date:{}'.format(raw_total, db_total, day)
            self.assertEqual(raw_total, db_total, db_msg)

            self.assertTrue(db_total > 0, "db count is 0")

    def test_publisher_est_etl_final_timelines(self):
        """The routing config must resolve a fixed trigger time to the expected check date."""
        # Every Tuesday 15:00 UTC(23:00 BJ) time will refresh the data of last Full Week.
        # E.g. 2020-02-12 10:00 the data of 2020-02-02 ~ 2020-02-08 will be ready
        trigger_at = datetime.datetime.strptime("2020-02-12 9:00:00", '%Y-%m-%d %H:%M:%S')
        resolved_date = self._get_check_date_from_routing_config(trigger_at)
        self.assertEqual("2020-02-08", resolved_date.strftime("%Y-%m-%d"))

    # @etl_skip()
    # def test_publisher_est_etl_accuracy(self):
    #     # Every Tuesday 16:00 UTC time will refresh the data of last Full Week.
    #     country_code = 'US'
    #     start_date, end_date = get_week_start_end_date(self.check_date_str)
    #     date_list = get_date_list(start_date, end_date)
    #     for date in date_list:
    #         raw_df = PublisherEstRawData(self.spark).get(date, country_code)
    #         unified_df = PublisherEstUnifiedData(self.spark).get(date, country_code)
    #         db_df = PublisherEstDBData().get(date)
    #
    #         self._compare_df(raw_df, unified_df, log="raw / unified - {}".format(date))
    #         self._compare_df(unified_df, db_df, log="unified / db - {}".format(date))


class TestPublisherEstPreviewDaily(PipelineTest):
    """Daily check of the preview publisher estimates: raw vs unified vs DB row counts."""
    # Every day 15:00 UTC(23:00 BJ) time will refresh the data of last Full Week.
    # NOTE(review): "last Full Week" is the same wording as the weekly test's comment — confirm
    # what the daily preview run actually refreshes.
    trigger_date_config = ('* 15 * * *', 1)

    def test_publisher_est_preview_daily_etl_completeness(self):
        """The three sources must report the same non-zero count for the check date."""
        raw_total = PublisherEstRawData(self.spark).get_metrics_count(self.check_date_str, 'preview')
        unified_total = PublisherEstUnifiedData(self.spark).get_metrics_count(self.check_date_str)
        db_total = PublisherEstDBData().get_metrics_count(self.check_date_str)

        for other_total in (unified_total, db_total):
            self.assertEqual(raw_total, other_total)
        self.assertTrue(db_total > 0)




In [0]:



from applications.db_check_v1.common.db_check_utils import query

import os

# Inclusive range of dates to compare across raw / unified / DB / plproxy.
begin_date = datetime.datetime(2020, 8, 9)
end_date = datetime.datetime(2020, 8, 18)

# Placeholders: table suffix, date, store_id filter.
sql_str = """
select sum(count_a) from plproxy.execute_select_nestloop($proxy$ 
    select count(pub_id) as counta
    from pp.pub_store_daily_estimate_{}
    where 
        date = '{}' and {}
$proxy$) tbl (count_a BIGINT);

"""

# The password is read from the environment so that no credential is stored in the notebook.
# A missing PLPROXY_PASSWORD raises KeyError here, before any query is attempted.
plproxy_dsn = (
    "dbname='{db}' user='{user}' password='{password}' "
    "host='{host}' port='{port}'".format(
        db="dailyest",
        user="app_bdp_usage_qa",
        host="internal-aa-prod-plproxy-internal-4-329644124.us-east-1.elb.amazonaws.com",
        password=os.environ["PLPROXY_PASSWORD"],
        port=7432
    )
)

date_list = get_date_list(begin_date, end_date, "D")

# NOTE: store ids 143547 and 143601 are among the ids absent from the iOS list below.
ios_c_f = "store_id in (0,143441,143442,143443,143444,143445,143446,143447,143448,143449,143450,143451,143452,143453,143454,143455,143456,143457,143458,143459,143460,143461,143462,143463,143464,143465,143466,143467,143468,143469,143470,143471,143472,143473,143474,143475,143476,143477,143478,143479,143480,143481,143482,143483,143484,143485,143486,143487,143488,143489,143491,143492,143493,143494,143495,143496,143497,143498,143499,143500,143501,143502,143503,143504,143505,143506,143507,143508,143509,143510,143511,143512,143513,143514,143515,143516,143517,143518,143519,143520,143521,143523,143524,143525,143526,143527,143528,143529,143530,143531,143532,143533,143534,143535,143536,143537,143538,143539,143540,143541,143542,143543,143544,143545,143546,143548,143549,143550,143551,143552,143553,143554,143555,143556,143557,143558,143559,143560,143561,143562,143563,143564,143565,143566,143567,143568,143570,143571,143572,143573,143574,143575,143576,143577,143578,143579,143580,143581,143582,143583,143584,143585,143586,143587,143588,143589,143590,143591,143592,143593,143594,143595,143597,143598,143599,143600,143602,143603,143604,143605,143606,143608,143609,143610,143612,143613,143614,143615,143617,143619,143620,143621,143622,143624)"
gp_c_f = "store_id in (1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,1000)"

log = ''
for date in date_list:
    try:
        # Table suffix 0 is queried with the Google Play store ids and suffix 1 with the
        # iOS store ids; the variable names now match the filter they use.
        count_gp = query(plproxy_dsn, sql_str.format(0, date, gp_c_f))[0][0]
        count_ios = query(plproxy_dsn, sql_str.format(1, date, ios_c_f))[0][0]
        raw_count = PublisherEstRawData(spark).get_metrics_count(date, stage='preview')
        unified_count = PublisherEstUnifiedData(spark).get_metrics_count(date)
        db_count = PublisherEstDBData().get_metrics_count(date)
        # Output columns: date, raw, unified, db, plproxy (ios + gp)
        print("{},{},{},{},{}".format(date, raw_count, unified_count, db_count, count_ios + count_gp))
    except Exception as e:
        # Best effort: report the failure for this date and continue with the next one.
        log_tmp = "{},{}".format(date, 'error')
        print(e)
        print(log_tmp)

    


    



In [0]:
%%sh

# Per-date row counts of pp.pub_store_daily_estimate, summed over the plproxy result rows.
# The password comes from the PLPROXY_PASSWORD environment variable; the cell aborts with a
# message when it is unset, so that no credential is stored in the notebook.
PGPASSWORD="${PLPROXY_PASSWORD:?PLPROXY_PASSWORD must be set}" psql -h internal-aa-prod-plproxy-internal-4-329644124.us-east-1.elb.amazonaws.com -U app_bdp_usage_qa -d dailyest -p 7432 << EOF 
select date, sum(count_a) from plproxy.execute_select_nestloop(\$proxy\$ 
    select date, count(pub_id) as counta
    from pp.pub_store_daily_estimate
    where 
        date between '2020-07-18' and '2020-07-31' group by date
\$proxy\$) tbl (date Date, count_a BIGINT)  group by date  order by date asc;



EOF



In [0]:
%%sh


# Row count of pp.pub_store_daily_estimate_2 for one date and one store id.
# The password comes from the PLPROXY_PASSWORD environment variable; the cell aborts with a
# message when it is unset, so that no credential is stored in the notebook.
PGPASSWORD="${PLPROXY_PASSWORD:?PLPROXY_PASSWORD must be set}" psql -h internal-aa-prod-plproxy-internal-4-329644124.us-east-1.elb.amazonaws.com -U app_bdp_usage_qa -d dailyest -p 7432 << EOF 
select store_id, sum(count_a) from plproxy.execute_select_nestloop(\$proxy\$ 
    select store_id,count(pub_id) as counta
    from pp.pub_store_daily_estimate_2
    where 
        date = '2020-03-22' and store_id= 143547
    group by store_id
\$proxy\$) tbl ( store_id INT, count_a BIGINT) group by store_id order by  sum(count_a) asc;

EOF


# Disabled variant: per-category counts from pp.pub_store_daily_estimate_0 for store_id 50.
# PGPASSWORD="${PLPROXY_PASSWORD:?PLPROXY_PASSWORD must be set}" psql -h internal-aa-prod-plproxy-internal-4-329644124.us-east-1.elb.amazonaws.com -U app_bdp_usage_qa -d dailyest -p 7432 << EOF 
# select category_id, sum(count_a) from plproxy.execute_select_nestloop(\$proxy\$ 
#     select category_id,count(pub_id) as counta
#     from pp.pub_store_daily_estimate_0
#     where 
#         date = '2020-03-22' AND store_id=50
#     group by category_id
# \$proxy\$) tbl ( category_id INT, count_a BIGINT) group by category_id order by  sum(count_a) asc;

# EOF



In [0]:
%%sh

# Per-date count of non-null metric values (free downloads, paid downloads, revenue) for
# country MS on iOS devices in store.store_est_publisher_fact_v1.
# The password comes from the CITUS_PASSWORD environment variable; the cell aborts with a
# message when it is unset, so that no credential is stored in the notebook.
PGPASSWORD="${CITUS_PASSWORD:?CITUS_PASSWORD must be set}" psql -h 10.2.6.141  -U citus_bdp_prod_app_int_qa -d aa_store_db -p 5432 << EOF 

-- The country filter sits inside each sub-select: the sub-selects only project date and
-- count_a, so country_code cannot be referenced from the outer query.
select date,sum(count_a) as count_b from (
 (select date,count(publisher_id) as count_a from store.store_est_publisher_fact_v1 where date between '2020-07-18' and '2020-07-31' and device_code in ('ios-phone', 'ios-tablet') and country_code='MS' and est_free_app_download is not null group by date) 
 UNION ALL
 (select date,count(publisher_id) as count_a from store.store_est_publisher_fact_v1 where date between '2020-07-18' and '2020-07-31' and device_code in ('ios-phone', 'ios-tablet') and country_code='MS' and est_paid_app_download is not null group by date)
 UNION ALL
 (select date,count(publisher_id) as count_a from store.store_est_publisher_fact_v1 where date between '2020-07-18' and '2020-07-31' and device_code in ('ios-phone', 'ios-tablet') and country_code='MS' and est_revenue is not null group by date)
) as b group by b.date order by b.date asc;

-- select category_id,count(publisher_id) as count_a from store.store_est_publisher_fact_v1 where date='2020-03-22' and device_code in ('android-all') and country_code='KW' group by category_id order by count_a asc;

-- select * from store.store_est_publisher_fact_v1 where date='2020-03-22' and device_code in ('android-all') and country_code='KW' and category_id=400004 order by publisher_id asc;
 
EOF



In [0]:


# Delta location of the unified publisher-estimate fact table; the placeholder is the date partition.
unified_s3_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-publisher.v1/fact/" \
                  "granularity=daily/date={}/"

# Name the date explicitly instead of relying on the `date` loop variable left behind by the
# comparison cell; after a complete run of that loop this is the same value.
unified_check_date = date_list[-1]

spark.read.format("delta").load(unified_s3_path.format(unified_check_date))

In [0]:
%%sh

# Download the regression log from S3 into /tmp, then print it.
log_source=s3://b2c-prod-data-pipeline-qa/tom/top_publisher/regression_plproxy.txt
log_copy=/tmp/regression_plproxy.txt

aws s3 cp "$log_source" "$log_copy"
cat "$log_copy"


In [0]:
%%sh
