In [0]:

# Register the bundled dependency archive with the Spark context before the rest of the cell runs.
# NOTE(review): `spark` is expected to be injected by the notebook interpreter - confirm.
spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import pandas as pd
# Keep wide DataFrames on one line when printed instead of wrapping columns across several lines.
pd.set_option('expand_frame_repr', False)


import datetime
def get_date_list(begin_date, end_date, freq="D"):
    """
    Build the list of period dates between two dates, formatted as 'YYYY-MM-DD'.

    :param begin_date: first date of the range (anything pandas.date_range accepts as `start`)
    :param end_date: last date of the range (anything pandas.date_range accepts as `end`)
    :param freq: pandas frequency alias, e.g. "D", "W-SAT", "M".
                 Defaults to "D" (daily, the same default pandas uses) so the helper can
                 also be called with only the two dates, as the test classes in this
                 notebook do.
    :return: list of date strings; empty when the range contains no period
    """
    # Iterate the DatetimeIndex directly; wrapping it in list() first is unnecessary.
    return [stamp.strftime('%Y-%m-%d')
            for stamp in pd.date_range(start=begin_date, end=end_date, freq=freq)]

def debug(case_list):
    """
    Run the given unittest cases inside the notebook (Zeppelin) and print the results.

    e.g. test case list example
    TestMarketSizeWeekly(trigger_datetime=t_date, methodName='test_market_size_etl_accuracy_and_completeness')

    :param case_list: iterable of unittest.TestCase instances to execute
    :return: None; the runner writes its report to the standard streams
    """
    # Imported locally so the helper works no matter which notebook cells ran before it.
    import sys
    import traceback
    import unittest

    # Remember the interpreter streams: the runner uses buffer=True, so restore them
    # unconditionally once the run is over.
    std_out_origin = sys.stdout
    std_err_origin = sys.stderr
    try:
        suite = unittest.TestSuite()
        for case in case_list:
            suite.addTest(case)
        runner = unittest.TextTestRunner(verbosity=2, buffer=True)
        runner.run(suite)
    except Exception as ex:
        # repr() works for every exception type; `ex.message` is not a reliable attribute.
        print(repr(ex))
        # print_exc() reports the exception being handled without needing
        # `ex.__traceback__`, which does not exist on Python 2.
        traceback.print_exc()
    finally:
        sys.stdout = std_out_origin
        sys.stderr = std_err_origin


def load_context_zepplin(spark):
    """
    Register the application archives with the Spark context of a Zeppelin session.

    :param spark: Spark session whose `sparkContext` receives the archives
    """
    archives = (
        "/home/hadoop/bdp/application/libs/python/dependencies.zip",
        "/home/hadoop/bdp/application/code.zip",
    )
    # Dependencies are added first, then the application code, as before.
    for archive in archives:
        spark.sparkContext.addPyFile(archive)

def _compare_df(self, df1, df2, log=''):
    for diff_type in ["left", "right"]:
        diff_df = df1.merge(df2, indicator=True, how=diff_type)  # .loc[lambda x : x['_merge']!='both']
        diff_df = diff_df.loc[diff_df["_merge"] != "both"]
        if len(diff_df) != 0:
            print diff_type
            print "dataframe overview of df1 and df2"
            print df1
            print df2
            print "dimension overview of diff df"
            print diff_df.country_code.unique()
            print diff_df.category_id.unique()
            print diff_df.device_code.unique()
        self.assertEqual(len(diff_df), 0,
                         msg="found mismatch when compare the raw, unified, db."
                             " diff count is \n {}, logs:{}".format(len(diff_df), log))
        print "PASS - {}".format(log)
        
import unittest
import datetime
from applications.db_check_v1.common.utils import string_to_datetime, datetime_to_string
from applications.db_check_v1.common.db_check_utils import _get_date_from_refresh_routing_config, etl_skip
from applications.db_check_v1.common.constants import query
from conf.settings import PG_USAGE_HOSTS, PG_USAGE_NAME, PG_USAGE_ACCESS_ID, PG_USAGE_SECRET_KEY, \
    CITUS_USAGE_NAME, CITUS_USAGE_ACCESS_ID, CITUS_USAGE_HOSTS, CITUS_USAGE_SECRET_KEY


def _format_dsn(db, user, password, host, port):
    """Render the key/value connection string shared by both DSN constants."""
    return (
        "dbname='{db}' user='{user}' password='{password}' "
        "host='{host}' port='{port}'".format(
            db=db,
            user=user,
            host=host,
            password=password,
            port=port
        )
    )


# Connection string built from the PG_USAGE_* settings; uses the first configured host entry.
PLPROXY_DSN = _format_dsn(
    PG_USAGE_NAME,
    PG_USAGE_ACCESS_ID,
    PG_USAGE_SECRET_KEY,
    PG_USAGE_HOSTS[0][0],
    PG_USAGE_HOSTS[0][1],
)

# Connection string built from the CITUS_USAGE_* settings; uses the first configured host entry.
CITUS_DSN = _format_dsn(
    CITUS_USAGE_NAME,
    CITUS_USAGE_ACCESS_ID,
    CITUS_USAGE_SECRET_KEY,
    CITUS_USAGE_HOSTS[0][0],
    CITUS_USAGE_HOSTS[0][1],
)




In [0]:


# Copyright (c) 2018 App Annie Inc. All rights reserved.
# pylint: disable=E1101,C0412,C1801

"""
DB Check modules
"""

import datetime

from applications.db_check_v1.common.base_test import PipelineTest
from applications.db_check_v1.common.db_check_utils import query_df, etl_skip
from applications.db_check_v1.common.constants import COUNTRY_CODE_MAPPING_BY_MARKET_CODE as COUNTRY_CODE_MAPPING, \
    CATEGORY_ID_MAPPING_BY_MARLKET_AND_DEVICE_CODE as CATEGORY_ID_MAPPING
from applications.db_check_v1.common.utils import get_week_start_end_date, get_date_list
from applications.db_check_v1.cases.store.app_est_publisher_v1.constants import PUBLISHER_EST_DSN

# TODO: move this list into the shared constants module.
# Metric columns whose non-null values are counted by the unified and DB count checks below.
DB_PUBLISHER_EST_METRICS = ["est_free_app_download", "est_paid_app_download", "est_revenue"]

class PublisherEstRawData(object):
    """
    Accessor for the raw daily publisher-estimate data stored on S3.

    `get` loads one day for one country and reshapes it into one row per
    publisher / category / device / country with one column per metric;
    the `*_count` methods only count matching raw rows.
    """
    # S3 location template of the raw parquet data; "{}" is filled with the date.
    raw_s3_path = "s3://b2c-prod-dca-store-estimates/store_estv2/PUB_ESTIMATES_FINAL/version=2.0.0/range_type=DAY" \
                  "/date={}/"
    # Keys are str(platform_id) + str(feed) as concatenated in _parse_mapping,
    # e.g. "1100" is platform_id 1 with feed 100; values are the device codes.
    device_code_mapping = {
        "00": "android-all",
        "01": "android-all",
        "02": "android-all",
        "10": "ios-phone",
        "11": "ios-phone",
        "12": "ios-phone",
        "1100": "ios-tablet",
        "1101": "ios-tablet",
        "1102": "ios-tablet",
        # "21000": "ios-all",
        # "21001": "ios-all",
        # "21002": "ios-all",
    }

    # Raw `feed` id -> metric name; also defines which feeds get_metrics_count counts.
    # NOTE(review): 101 maps to free and 100 to paid, the reverse of the 0/1 ordering -
    # confirm this is intentional.
    metric_mapping = {
        0: "free_app_download",
        1: "paid_app_download",
        2: "revenue",
        101: "free_app_download",
        100: "paid_app_download",
        102: "revenue",
        # 1000: "free_app_download",
        # 1001: "paid_app_download",
        # 1002: "revenue"
    }

    # Raw column name -> output column name.
    dimension_mapping = {
        "id": "publisher_id",
    }

    def __init__(self, spark):
        """Keep the Spark session used for all reads."""
        self.spark = spark

    def get(self, date, country_code):
        """
        Load the raw data of one date and country and return it as a pandas DataFrame.

        Steps: read + filter, map raw ids to codes, pivot metrics into columns,
        drop rows whose category / country could not be mapped.
        """
        df = self._get_raw_data_by_date_country(date, country_code)
        df = self._parse_mapping(df)
        df = self._parse_unified_format(df)
        df = self._data_clean_up(df)
        return df

    def _data_clean_up(self, df):
        """Keep only rows whose category_id and country_code are values of the mappings."""
        # clean unknown mapping
        # NOTE: `.values() + .values()` is list concatenation and relies on Python 2 dict.values().
        category_id_list = list(set(CATEGORY_ID_MAPPING["apple-store"]["ios-all"].values() +
                                    CATEGORY_ID_MAPPING["google-play"]["android-all"].values()))

        country_code_list = list(set(COUNTRY_CODE_MAPPING["apple-store"].values() +
                                     COUNTRY_CODE_MAPPING["google-play"].values()))

        df = df[(df['category_id'].isin(category_id_list)) & (df['country_code'].isin(country_code_list))]
        return df

    def _parse_mapping(self, df):
        """
        Translate raw ids into codes, per platform (platform_id 0 uses the google-play
        mappings, platform_id 1 the apple-store mappings).

        Note that the `.loc[...] = ...` assignments modify the passed DataFrame in place.
        """
        # country_code mapping
        df.loc[df["platform_id"] == 0] = df.loc[df["platform_id"] == 0].\
            replace({"store_id": COUNTRY_CODE_MAPPING["google-play"]})
        df.loc[df["platform_id"] == 1] = df.loc[df["platform_id"] == 1].\
            replace({"store_id": COUNTRY_CODE_MAPPING["apple-store"]})
        df = df.rename(columns={'store_id': 'country_code'})

        # category_id mapping
        df.loc[df["platform_id"] == 0] = df.loc[df["platform_id"] == 0].\
            replace({"category_id": CATEGORY_ID_MAPPING["google-play"]["android-all"]})
        df.loc[df["platform_id"] == 1] = df.loc[df["platform_id"] == 1].\
            replace({"category_id": CATEGORY_ID_MAPPING["apple-store"]["ios-all"]})

        # device_code mapping
        # Must happen before `feed` is replaced by metric names below: the lookup key
        # is built from the numeric feed id.
        df["device_code"] = df["platform_id"].astype(str) + df["feed"].astype(str)
        df = df.replace({"device_code": self.device_code_mapping})

        # granularity
        df["granularity"] = "daily"

        # metrics mapping (from feed)
        df = df.replace({"feed": self.metric_mapping})
        return df

    def _parse_unified_format(self, df):
        """
        Pivot the long format (one row per feed) into one row per dimension combination
        with one column per `feed` value, taking the values from `est`.
        """
        df = df.rename(columns=self.dimension_mapping)
        # NOTE(review): no aggfunc is given, so pandas' pivot_table default applies if an
        # index/feed combination occurs more than once - confirm duplicates cannot occur.
        df = df.pivot_table(index=["publisher_id", "category_id", "device_code", "country_code", "granularity"],
                            columns='feed', values='est')
        df.reset_index(inplace=True)
        # Drop the "feed" label left on the column axis by the pivot.
        df.columns.name = None
        return df

    def _get_raw_data_by_date_country(self, date, country_code):
        """
        Read the raw parquet data of `date` and keep the rows whose store_id belongs to
        `country_code` in either the apple-store or the google-play mapping.

        NOTE(review): the filter is on store_id only, not on platform, so an id taken from
        one store's mapping also selects rows of the other platform - confirm this is intended.
        NOTE(review): if no store id matches `country_code` the filter becomes
        "store_id in ()" - confirm Spark accepts that.

        Example of the raw rows:
        +----------+--------+-----------+-----------+--------+----+----+-----+--------+
        |        id|store_id|category_id|platform_id|vertical|rank|feed|  est|platform|
        +----------+--------+-----------+-----------+--------+----+----+-----+--------+
        | 284417353|       0|       6006|          1|       1|   1|1002|45235|     ios|
        | 349554266|       0|       6006|          1|       1|   2|1002|20732|     ios|
        |1316153435|       0|       6006|          1|       1|   3|1002|15136|     ios|
        +----------+--------+-----------+-----------+--------+----+----+-----+--------+
        """
        ios_store_ids = [str(k) for k, v in COUNTRY_CODE_MAPPING["apple-store"].items() if v == country_code]
        gp_store_ids = [str(k) for k, v in COUNTRY_CODE_MAPPING["google-play"].items() if v == country_code]
        raw_df = self.spark.read.parquet(self.raw_s3_path.format(date)).\
            filter('store_id in ({})'.format(",".join(ios_store_ids + gp_store_ids))).toPandas()
        return raw_df

    def get_metrics_count(self, date):
        """
        Count the raw rows of `date` whose store_id is a key of the platform's country
        mapping and whose feed is a key of `metric_mapping`; platform_id 1 is matched
        against the apple-store ids and platform_id 0 against the google-play ids.
        """
        ios_store_id = [str(s_id) for s_id in COUNTRY_CODE_MAPPING["apple-store"].keys()]
        gp_store_id = [str(s_id) for s_id in COUNTRY_CODE_MAPPING["google-play"].keys()]
        # ios_category_id = [str(c_id) for c_id in CATEGORY_ID_MAPPING["apple-store"]["ios-all"].keys()]
        # gp_category_id = [str(c_id) for c_id in CATEGORY_ID_MAPPING["google-play"]["android-all"].keys()]

        # Filter template: platform id, comma separated store ids, comma separated feed ids.
        fillter_sql = "platform_id = {} and store_id in ({}) and feed in ({})"
        df = self.spark.read.parquet(self.raw_s3_path.format(date))
        feed_ids_sql = ",".join([str(x) for x in self.metric_mapping.keys()])

        count_all = df.filter(fillter_sql.format(1, ",".join(ios_store_id), feed_ids_sql)).count() + \
                    df.filter(fillter_sql.format(0, ",".join(gp_store_id), feed_ids_sql)).count()
        return count_all

    def get_v1_raw_metrics_count(self, date):
        """
        Count the rows of the v1 raw data (tab-delimited, gzipped CSV files) of `date`
        whose store id is a key of the platform's country mapping.

        Unlike get_metrics_count, no feed filter is applied here.
        """
        ios_store_id = [str(s_id) for s_id in COUNTRY_CODE_MAPPING["apple-store"].keys()]
        gp_store_id = [str(s_id) for s_id in COUNTRY_CODE_MAPPING["google-play"].keys()]

        # The files are read without a header, so columns get Spark's positional names (_c0, _c1, ...).
        df = self.spark.read.option("delimiter", "\t").csv(
            "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/{}/*/sbe_est_publisher/*/*.csv.gz".format(date))
        fillter_sql = "_c2 = {} and _c0 in ({})"  # _c2 > platform_id, _c0 > store_id

        count_all = df.filter(fillter_sql.format(1, ",".join(ios_store_id))).count() + \
                    df.filter(fillter_sql.format(0, ",".join(gp_store_id))).count()
        return count_all


class PublisherEstUnifiedData(object):
    """Accessor for the unified daily publisher-estimate data (delta format on S3)."""
    # S3 location template of the unified daily data; "{}" is filled with the date.
    unified_s3_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-publisher-dna-log.v1/" \
                      "fact/granularity=daily/date={}/"
    # Device codes taken into account when counting metrics.
    available_device_codes = ['ios-phone', 'ios-tablet', 'android-all']

    def __init__(self, spark):
        """Keep the Spark session used for all reads."""
        self.spark = spark

    def get(self, date, country_code):
        """
        Load the unified rows of one date and country as a pandas DataFrame,
        without the bookkeeping / IAP revenue columns.
        """
        reader = self.spark.read.format("delta")
        day_frame = reader.load(self.unified_s3_path.format(date))
        country_frame = day_frame.filter("country_code = '{}'".format(country_code))
        columns_to_drop = ["_identifier", "revenue_iap", "revenue_non_iap", "date"]
        return country_frame.toPandas().drop(columns_to_drop, axis=1)

    def get_metrics_count(self, date):
        """
        Count, summed over every metric of DB_PUBLISHER_EST_METRICS, the rows of `date`
        with an available device code and a non-null value for that metric.
        """
        day_frame = self.spark.read.format("delta").load(self.unified_s3_path.format(date))
        # Joined with "','" so the list can be dropped between the quotes of the IN clause.
        quoted_device_codes = "','".join(self.available_device_codes)
        total = 0
        for metric in DB_PUBLISHER_EST_METRICS:
            condition = "device_code in ('{}') and {} is not null".format(quoted_device_codes, metric)
            total += day_frame.filter(condition).count()
        return total


class PublisherEstDBData(object):
    """Accessor for the daily publisher-estimate fact table queried through PUBLISHER_EST_DSN."""

    def get(self, date):
        """Return every row of `date` from the daily fact table, as returned by query_df."""
        statement = "SELECT * FROM store.store_est_publisher_fact_v2 WHERE date='{}'".format(date)
        return query_df(PUBLISHER_EST_DSN, statement)

    def get_metrics_count(self, date):
        """
        Count, summed over every metric of DB_PUBLISHER_EST_METRICS, the rows of `date`
        whose value for that metric is not null.
        """
        count_template = "SELECT count(*) AS metrics_count FROM store.store_est_publisher_fact_v2 WHERE date='{}' AND {} IS NOT NULL"
        total = 0
        for metric in DB_PUBLISHER_EST_METRICS:
            result = query_df(PUBLISHER_EST_DSN, count_template.format(date, metric))
            # The query returns a single row holding the count.
            total += result.loc[0].metrics_count
        return total


class TestIOSPublisherEstWeekly(PipelineTest):
    """
    Weekly checks of the publisher-estimate pipeline: the raw, unified and DB layers
    must hold the same number of metric values for every day of the checked week.
    """
    # Every Monday 10:00 UTC (18:00 BJ) the data of the last full week is refreshed.
    trigger_date_config = ('* 10 * * 1', 2)

    def _compare_df(self, df1, df2, log=''):
        """
        Assert that two pandas DataFrames contain the same rows.

        The frames are merged on their common columns, once as a left join and once as
        a right join; any row that is not present on both sides counts as a mismatch.

        :param df1: first DataFrame
        :param df2: second DataFrame
        :param log: free text appended to the failure message
        :raises AssertionError: when a mismatch is found
        """
        dimension_columns = ("country_code", "category_id", "device_code")
        for diff_type in ["left", "right"]:
            merged = df1.merge(df2, indicator=True, how=diff_type)
            diff_df = merged.loc[merged["_merge"] != "both"]
            if len(diff_df) != 0:
                print(diff_type)
                print("dataframe overview of df1 and df2")
                print(df1)
                print(df2)
                print("dimension overview of diff df")
                # Only report the dimensions that exist; a missing column must not raise
                # AttributeError and hide the assertion failure below.
                for column in dimension_columns:
                    if column in diff_df.columns:
                        print(diff_df[column].unique())
            self.assertEqual(len(diff_df), 0,
                             msg="found mismatch when compare the raw, unified, db."
                                 " diff count is \n {}, logs:{}".format(len(diff_df), log))

    # @etl_skip()
    # def test_publisher_est_etl_accuracy(self):
    #     # Every Tuesday 16:00 UTC time will refresh the data of last Full Week.
    #     country_code = 'US'
    #     start_date, end_date = get_week_start_end_date(self.check_date_str)
    #     date_list = get_date_list(start_date, end_date)
    #     for date in date_list:
    #         raw_df = PublisherEstRawData(self.spark).get(date, country_code)
    #         unified_df = PublisherEstUnifiedData(self.spark).get(date, country_code)
    #         db_df = PublisherEstDBData().get(date)
    #
    #         self._compare_df(raw_df, unified_df, log="raw / unified - {}".format(date))
    #         self._compare_df(unified_df, db_df, log="unified / db - {}".format(date))

    @etl_skip()
    def test_publisher_est_etl_completeness(self):
        """Raw, unified and DB metric counts must match, and be non-zero, for each day of the week."""
        start_date, end_date = get_week_start_end_date(self.check_date_str)
        date_list = get_date_list(start_date, end_date)

        # The accessors only hold the session / DSN, so build them once for all dates.
        raw_data = PublisherEstRawData(self.spark)
        unified_data = PublisherEstUnifiedData(self.spark)
        db_data = PublisherEstDBData()

        for date in date_list:
            raw_count = raw_data.get_metrics_count(date)
            unified_count = unified_data.get_metrics_count(date)
            db_count = db_data.get_metrics_count(date)
            # Name the failing date: the loop covers a whole week.
            self.assertEqual(raw_count, unified_count,
                             msg="raw / unified metrics count mismatch on {}".format(date))
            self.assertEqual(raw_count, db_count,
                             msg="raw / db metrics count mismatch on {}".format(date))
            self.assertGreater(db_count, 0,
                               msg="no metrics found in db on {}".format(date))

    def test_publisher_est_etl_timelines(self):
        """The Monday 10:00 UTC trigger must resolve to the last day of the previous full week."""
        # Every Monday 10:00 UTC(18:00 BJ) time will refresh the data of last Full Week.
        # E.g. 2020-02-10 10:00 the data of 2020-02-02 ~ 2020-02-08 will be ready
        trigger_datetime = datetime.datetime.strptime("2020-02-10 10:00:00", '%Y-%m-%d %H:%M:%S')
        check_date_str_actual = self._get_check_date_from_routing_config(trigger_datetime).strftime("%Y-%m-%d")
        self.assertEqual("2020-02-08", check_date_str_actual)


class TestGPPublisherEstWeekly(TestIOSPublisherEstWeekly):
    """
    Same weekly checks as TestIOSPublisherEstWeekly, run on the Tuesday trigger
    schedule; only the timeline expectation is overridden.
    """
    # Every Tuesday 10:00 UTC(18:00 BJ) time will refresh the data of last Full Week.
    trigger_date_config = ('* 10 * * 2', 2)

    def test_publisher_est_etl_timelines(self):
        """The Tuesday 10:00 UTC trigger must resolve to the last day of the previous full week."""
        # E.g. 2020-02-11 10:00 the data of 2020-02-02 ~ 2020-02-08 will be ready
        tuesday_trigger = datetime.datetime.strptime("2020-02-11 10:00:00", '%Y-%m-%d %H:%M:%S')
        resolved_check_date = self._get_check_date_from_routing_config(tuesday_trigger)
        self.assertEqual("2020-02-08", resolved_check_date.strftime("%Y-%m-%d"))


In [0]:


# Compare the metric counts of the raw, unified and DB layers for every day of the range.
begin_date = datetime.datetime(2020, 6, 1)
end_date = datetime.datetime(2020, 7, 22)

date_list = get_date_list(begin_date, end_date, "D")

# The accessors only hold the session / DSN, so build them once instead of per date.
raw_data = PublisherEstRawData(spark)
unified_data = PublisherEstUnifiedData(spark)
db_data = PublisherEstDBData()

for date in date_list:
    count1 = raw_data.get_metrics_count(date)
    # count1 = raw_data.get_v1_raw_metrics_count(date)

    count2 = unified_data.get_metrics_count(date)

    count3 = db_data.get_metrics_count(date)

    # Second column: (raw - db) + (unified - db); it is 0 when the three counts agree.
    print("{}, {}, {}, {}, {}".format(date, count1+count2-count3*2, count1, count2, count3))


In [0]:

# spark.read.format("delta").load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-publisher.v1/fact/granularity=daily/date=2020-02-05/device_code=ios-phone/").show(10)
# Preview ten raw rows with platform_id 1 from the platform=ios partition of 2020-05-01.
(
    spark.read.format("parquet")
    .load("s3://b2c-prod-dca-store-estimates/store_estv2/PUB_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/date=2020-05-01/platform=ios/")
    .filter("platform_id=1")
    .show(10)
)

In [0]:
%%sh

# List what is stored under the daily-granularity fact path of the unified publisher-estimate dataset.
#aws  s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/ | grep app-est-publisher
aws  s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-publisher-dna-log.v1/fact/granularity=daily/

# aws s3 ls s3://b2c-prod-dca-store-estimates/store_estv2/PUB_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/date=2020-05-01/platform=ios/

In [0]:

from applications.db_check_v1.common.db_check_utils import query
from applications.db_check_v1.common.utils import get_date_list
import datetime
from applications.db_check_v1.cases.usage.basic_kpi_v3.test_basic_kpi_v3_routine_plproxy import CITUS_DSN

# Connection string used by the aggregation checks below; rebinds PUBLISHER_EST_DSN to the imported CITUS_DSN.
PUBLISHER_EST_DSN = CITUS_DSN

# NOTE(review): not referenced by the code visible in this notebook - confirm it is still needed.
checkdate = datetime.datetime.now()

# Granularity -> fact table (in the `store` schema) holding the data of that granularity.
granularity_db_mapping = {
    "daily": "store_est_publisher_fact_v2",
    "weekly": "store_est_publisher_t_w_fact_v2",
    "monthly": "store_est_publisher_t_m_fact_v2",
    "quarterly": "store_est_publisher_t_q_fact_v2",
    "yearly": "store_est_publisher_t_y_fact_v2"
}

# Granularity -> finer granularity whose table it is checked against.
# "daily" has no entry, so it cannot be passed to check_aggr_by_sum_period.
granularity_check_rules = {
    "weekly": "daily",
    "monthly": "daily",
    "quarterly": "monthly",
    "yearly": "quarterly"
}

# Granularity -> pandas frequency alias handed to get_date_list to enumerate the period dates.
granularity_freq_mapping = {
    "daily": "D",
    "weekly": "W-SAT",
    "monthly": "M",
    "quarterly": "Q",
    "yearly": "Y"
}


def check_aggr_by_sum_period(check_date, granularity):
    """
    Check that the aggregated table of `granularity` agrees with the table it is built
    from for the last complete period ending on or before `check_date`.

    The revenue sum of the finer-grained table (see granularity_check_rules) over the
    period is compared with the revenue sum of the aggregated table over the same dates.
    One PASS or FAIL line is printed; on FAIL both SQL statements are printed as well.

    :param check_date: upper bound of the date range used to locate the period
    :param granularity: key of granularity_check_rules ("weekly", "monthly", "quarterly", "yearly")
    :return: True when both sums are equal, False otherwise
    :raises KeyError: when `granularity` has no check rule (e.g. "daily")
    :raises IndexError: when fewer than two period dates exist up to `check_date`
    """
    # start date of publisher data
    first_data_date = datetime.datetime(2010, 7, 4)

    date_list = get_date_list(first_data_date, check_date, freq=granularity_freq_mapping[granularity])
    # The checked period runs from the day after the previous period date up to the last one.
    previous_period_end = datetime.datetime.strptime(date_list[-2], '%Y-%m-%d')
    start_date = (previous_period_end + datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    end_date = date_list[-1]

    sql = "SELECT sum(est_revenue) FROM store.{table}  where date between '{start}' and '{end}';"
    compare_table = granularity_db_mapping[granularity_check_rules[granularity]]
    compare_sql = sql.format(table=compare_table, start=start_date, end=end_date)

    actual_table = granularity_db_mapping[granularity]
    actual_sql = sql.format(table=actual_table, start=start_date, end=end_date)

    # Run both queries and compare them: previously compare_sql was built but never
    # executed and PASS was printed unconditionally.
    # NOTE(review): assumes the results of `query` can be compared with `==` - confirm.
    c_count = query(PUBLISHER_EST_DSN, compare_sql)
    a_count = query(PUBLISHER_EST_DSN, actual_sql)
    if c_count == a_count:
        print("{} - {} - PASS - {}".format(check_date, granularity, a_count))
        return True

    print("{} - {} - FAIL".format(check_date, granularity))
    print(compare_sql)
    print(actual_sql)
    return False




In [0]:

# Run the aggregation check for every period date between the two bounds.
my_start_date = datetime.datetime(2010, 7, 4)
my_end_date = datetime.datetime(2020, 8, 4)

for granularity in ["monthly"]:
    print("*" * 200)
    print(granularity * 10)
    period_dates = get_date_list(my_start_date, my_end_date, freq=granularity_freq_mapping[granularity])
    # The first period date is skipped: the check needs a previous period date to start from.
    for my_check_date in period_dates[1:]:
        check_aggr_by_sum_period(my_check_date, granularity)



