In [0]:

# Register the bundled dependency and application archives with the SparkContext
# (addPyFile) before anything else in the notebook runs.
# NOTE(review): presumably these zips provide the `bdce` / `applications` packages
# imported by later cells — confirm against the deployment layout.
spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
spark.sparkContext.addPyFile("/home/hadoop/bdp/application/code.zip")
import pandas as pd
# Print wide DataFrames on one line instead of wrapping the columns across several blocks.
pd.set_option('expand_frame_repr', False)




In [0]:

from bdce.common.utils import update_application_code
# NOTE(review): presumably refreshes the application code available to this session for
# the given role / application name — the helper lives in bdce.common.utils; confirm there.
update_application_code(
    spark, role="BDP-PROD-APP-INT-QA", application_name="aa-int-qa-db-check"
)
# Register the dependency archive from the Zeppelin temp directory with the SparkContext.
# NOTE(review): presumably this file is produced by the call above — TODO confirm.
spark.sparkContext.addPyFile("/tmp/zeppelin_application_code/libs/python/dependencies.zip")


In [0]:


from applications.db_check_v1.db_check import send_message
# NOTE(review): called with no arguments; what is sent, and to whom, is defined in
# applications.db_check_v1.db_check — confirm before re-running this cell.
send_message()

In [0]:


# Copyright (c) 2018 App Annie Inc. All rights reserved.
# pylint: disable=E1101,C0412

# Copyright (c) 2018 App Annie Inc. All rights reserved.
# pylint: disable=E1101,C0412

"""
DB Check modules
"""

import datetime
from applications.db_check_v1.common.base_test import PipelineTest
from applications.db_check_v1.common.db_check_utils import query_df, _get_pre_date_from_refresh_routing_config, _get_date_from_refresh_routing_config
from pyspark.sql.types import LongType, IntegerType, StringType, StructType, StructField
from applications.db_check_v1.common.constants import COUNTRY_CODE_MAPPING_BY_MARKET_CODE as COUNTRY_CODE_MAPPING, \
    CATEGORY_ID_MAPPING_BY_MARLKET_AND_DEVICE_CODE as CATEGORY_ID_MAPPING
from applications.db_check_v1.common.utils import get_week_start_end_date
from applications.db_check_v1.cases.store.market_size_v1.constants import MARKET_SIZE_DSN
import pandas as pd


from functools import wraps


def etl_skip(delta_days=1):
    """Build a decorator that skips a test method triggered too long after the ETL.

    The decorated method must be bound to an object exposing ``routing_config``
    and ``trigger_datetime`` attributes. At call time the wrapper asks
    ``_get_pre_date_from_refresh_routing_config`` for a reference datetime
    (presumably the previous ETL completion time -- confirm in db_check_utils)
    and compares it with ``self.trigger_datetime`` (or the current UTC time when
    that attribute is falsy).

    Args:
        delta_days: Maximum allowed gap, in days, between the reference datetime
            and the trigger datetime. A larger gap skips the test.

    Returns:
        A decorator for test methods. The decorated method returns whatever the
        original method returns.

    Raises:
        unittest.SkipTest: raised by the wrapper when the gap exceeds ``delta_days``.
    """
    # Imported locally: the original relied on an undefined ``skipIf`` name, which
    # raised NameError the first time a decorated test was invoked.
    from unittest import SkipTest

    def inner_function(func):
        @wraps(func)
        def wrapper(self, *args, **kwargs):
            pre_etl_date = _get_pre_date_from_refresh_routing_config(self.routing_config)
            trigger_datetime = self.trigger_datetime or datetime.datetime.utcnow()
            if trigger_datetime - pre_etl_date > datetime.timedelta(days=delta_days):
                raise SkipTest(
                    "The trigger time {} is not in the {} * 24hrs after the ETL completed time {}.".format(
                        trigger_datetime, delta_days, pre_etl_date
                    )
                )
            # Propagate the wrapped method's result instead of silently dropping it.
            return func(self, *args, **kwargs)
        return wrapper
    return inner_function


class MarketSizeRawData(object):
    """Loads one day of raw market-size estimates and reshapes it for comparison.

    ``get`` reads the raw parquet partition, maps store / category / device
    identifiers through the project mapping tables, pivots the metrics into
    columns and drops rows whose category or country is not a known mapped value.
    """
    raw_s3_path = "s3://b2c-prod-dca-store-estimates/store_estv2/MARKET_SIZE_ESTIMATES_FINAL/" \
                  "version=2.0.0/range_type=DAY/date={date}/"
    # Keys are str(platform_id) + device, as built in _parse_mapping.
    device_code_mapping = {
        "0google-play": "android-all",
        "1ios": "ios-all",
        "1ipad": "ios-tablet",
        "1iphone": "ios-phone",
    }

    # Raw ``data_type`` values -> metric column names after the pivot.
    metric_mapping = {
        "downloads": "est_market_size_download",
        "revenue": "est_market_size_revenue"
    }

    # Raw dimension column names -> renamed dimension column names.
    dimension_mapping = {
        "price_type": "app_price_type_id",
        "purchase_type": "purchase_type_id"
    }

    def __init__(self, spark):
        """Keep the Spark session used to read the raw parquet data."""
        self.spark = spark

    def get(self, date):
        """Return the cleaned, pivoted pandas DataFrame for ``date``.

        Args:
            date: Value substituted into the ``date=`` partition of ``raw_s3_path``.
        """
        df = self._get_raw_data_by_date(date)
        df = self._parse_mapping(df)
        df = self._parse_unified_format(df)
        df = self._data_clean_up(df)
        return df

    def _data_clean_up(self, df):
        """Keep only rows whose category_id and country_code are known mapped values."""
        # Set union instead of ``values() + values()``: the latter only works when
        # dict.values() returns a list (Python 2).
        category_ids = set(CATEGORY_ID_MAPPING["apple-store"]["ios-all"].values()) | \
            set(CATEGORY_ID_MAPPING["google-play"]["android-all"].values())
        country_codes = set(COUNTRY_CODE_MAPPING["apple-store"].values()) | \
            set(COUNTRY_CODE_MAPPING["google-play"].values())

        known_rows = df['category_id'].isin(list(category_ids)) & df['country_code'].isin(list(country_codes))
        return df[known_rows]

    def _parse_mapping(self, df):
        """Translate store_id, category_id and device into unified codes.

        Rows with platform_id 0 use the "google-play" mappings and rows with
        platform_id 1 use the "apple-store" mappings. Note that the row
        assignments below modify the passed-in frame in place.
        """
        # country_code mapping
        df.loc[df["platform_id"] == 0] = df.loc[df["platform_id"] == 0].\
            replace({"store_id": COUNTRY_CODE_MAPPING["google-play"]})
        df.loc[df["platform_id"] == 1] = df.loc[df["platform_id"] == 1].\
            replace({"store_id": COUNTRY_CODE_MAPPING["apple-store"]})
        df = df.rename(columns={'store_id': 'country_code'})

        # category_id mapping
        df.loc[df["platform_id"] == 0] = df.loc[df["platform_id"] == 0].\
            replace({"category_id": CATEGORY_ID_MAPPING["google-play"]["android-all"]})
        df.loc[df["platform_id"] == 1] = df.loc[df["platform_id"] == 1].\
            replace({"category_id": CATEGORY_ID_MAPPING["apple-store"]["ios-all"]})

        # device_code mapping: build the "<platform_id><device>" key, then translate it
        df["device_code"] = df["platform_id"].astype(str) + df["device"]
        df = df.replace({"device_code": self.device_code_mapping})
        return df

    def _parse_unified_format(self, df):
        """Pivot ``data_type`` rows into one metric column per data type.

        Rows sharing the same dimension key are combined with pivot_table's
        default aggregation.
        """
        df = df.rename(columns=self.dimension_mapping)
        df = df.pivot_table(index=["app_price_type_id", "purchase_type_id", "category_id",
                                   "device_code", "country_code"], columns='data_type', values='estimate')
        df = df.reset_index()
        # Drop the "data_type" label left on the column axis by the pivot.
        df.columns.name = None
        df = df.rename(columns=self.metric_mapping)
        return df

    def _get_raw_data_by_date(self, date):
        """
        Read the raw parquet partition for ``date`` and return it as a pandas DataFrame.

        +--------+----------+-----------+-----------+---------+----------+-------------+-----------+--------+
        |store_id|      date|platform_id|     device|data_type|price_type|purchase_type|category_id|estimate|
        +--------+----------+-----------+-----------+---------+----------+-------------+-----------+--------+
        |      10|2020-01-10|          0|google-play|downloads|         1|           10|          1|11981138|
        +--------+----------+-----------+-----------+---------+----------+-------------+-----------+--------+
        """
        # The parquet files carry their own schema; the explicit StructType that used
        # to be built here was never passed to the reader, so it has been removed.
        raw_df = self.spark.read.parquet(self.raw_s3_path.format(date=date))
        return raw_df.toPandas()

    def get_rank_count_and_sum_by_date(self, date):
        """Placeholder: not implemented, currently returns None."""
        pass
        # return raw_count, raw_sum


class MarketSizeUnifiedData(object):
    """Reads one daily partition of the unified market-size fact data."""
    unified_s3_path = (
        "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.market-size.v1/fact/"
        "granularity=daily/date={}/"
    )

    def __init__(self, spark):
        """Keep the Spark session used for reading parquet."""
        self.spark = spark

    def get(self, date):
        """Return the partition for ``date`` as a pandas DataFrame without ``_identifier``."""
        partition_path = self.unified_s3_path.format(date)
        spark_frame = self.spark.read.parquet(partition_path)
        pandas_frame = spark_frame.toPandas()
        return pandas_frame.drop(["_identifier"], axis=1)


class MarketSizeDBData(object):
    """Fetches market-size fact rows through ``query_df`` on ``MARKET_SIZE_DSN``."""

    def get(self, date):
        """Return whatever ``query_df`` yields for the fact rows of ``date``."""
        template = "SELECT * FROM store.store_market_size_fact_v1  WHERE date='{}'"
        statement = template.format(date)
        return query_df(MARKET_SIZE_DSN, statement)


class TestMarketSizeDaily(PipelineTest):
    """Checks that raw, unified and DB market-size data agree for each day of a week."""
    # Every Tuesday 16:00 UTC time will refresh the data of last Full Week.
    routing_config = ('* 16 * * 2', 3)

    def __init__(self, methodName='runTest', trigger_datetime=None):
        """Forward ``methodName`` and ``trigger_datetime`` to PipelineTest."""
        super(TestMarketSizeDaily, self).__init__(methodName, trigger_datetime)

    def _compare_df(self, df1, df2):
        """Assert that ``df1`` and ``df2`` hold the same rows.

        The frames are merged on their common columns, once as a left join and
        once as a right join; any row not present on both sides fails the test.
        Before failing, the distinct country / category / device values of the
        unmatched rows are printed to help locate the mismatch.
        """
        for diff_type in ["left", "right"]:
            diff_df = df1.merge(df2, indicator=True, how=diff_type)
            diff_df = diff_df.loc[diff_df["_merge"] != "both"]
            if len(diff_df) != 0:
                print(diff_type)
                print(diff_df.country_code.unique())
                print(diff_df.category_id.unique())
                print(diff_df.device_code.unique())
            self.assertEqual(len(diff_df), 0,
                             msg="found {} unmatched rows in the {} join".format(len(diff_df), diff_type))

    @etl_skip(delta_days=1)
    def test_market_size_etl_accuracy_and_completeness(self):
        """Compare raw vs unified and unified vs DB data for every date of the checked week."""
        # Every Tuesday 16:00 UTC time will refresh the data of last Full Week.
        start_date, end_date = get_week_start_end_date(self.check_date_str)
        date_list = get_date_list(start_date, end_date)
        for date in date_list:
            print(date)
            raw_df = MarketSizeRawData(self.spark).get(date)
            unified_df = MarketSizeUnifiedData(self.spark).get(date)
            db_df = MarketSizeDBData().get(date)

            self._compare_df(raw_df, unified_df)
            self._compare_df(unified_df, db_df)
        # The unconditional ``assertEqual(1, 0)`` that used to end this test was a
        # debugging leftover that made the test fail even when all data matched.

    def test_market_size_etl_timelines(self):
        """Check the date derived from the routing config for a fixed trigger time."""
        # Every Tuesday 16:00 UTC time will refresh the data of last Full Week.
        # E.g. 2020-02-11 17:00 the data of 2020-02-02 ~ 2020-02-08 will be ready
        trigger_datetime = datetime.datetime.strptime("2020-02-11 17:00:00", '%Y-%m-%d %H:%M:%S')
        check_date_str_actual = self._get_check_date_from_routing_config(trigger_datetime).strftime("%Y-%m-%d")
        self.assertEqual("2020-02-08", check_date_str_actual)


def get_date_list(start_date, end_date, freq="D"):
    """Return the ``pd.date_range(start_date, end_date, freq)`` entries as 'YYYY-MM-DD' strings."""
    formatted_dates = []
    for timestamp in pd.date_range(start=start_date, end=end_date, freq=freq):
        formatted_dates.append(timestamp.strftime('%Y-%m-%d'))
    return formatted_dates



In [0]:


# Load one day of raw market-size data for ad-hoc inspection. The assignment is
# active again so this cell no longer depends on a `raw_df` left over from an
# earlier kernel state.
raw_df = MarketSizeRawData(spark)._get_raw_data_by_date("2019-12-09")

print(raw_df)

# TODO: the store_id filter was left unfinished -- the original line
# `raw_df[raw_df[store_id]==]` is a syntax error and stops the cell from running.
# Fill in the store id to inspect, e.g.:
#     raw_df[raw_df["store_id"] == <store_id>]


In [0]:


# Copyright (c) 2018 App Annie Inc. All rights reserved.
# pylint: disable=E1101,C0412,C1801

"""
DB Check modules
"""

import datetime
from applications.db_check_v1.common.base_test import PipelineTest
from applications.db_check_v1.common.db_check_utils import query_df, etl_skip
from applications.db_check_v1.common.constants import COUNTRY_CODE_MAPPING_BY_MARKET_CODE as COUNTRY_CODE_MAPPING, \
    CATEGORY_ID_MAPPING_BY_MARLKET_AND_DEVICE_CODE as CATEGORY_ID_MAPPING
from applications.db_check_v1.common.utils import get_week_start_end_date, get_date_list
from applications.db_check_v1.cases.store.market_size_v1.constants import MARKET_SIZE_DSN


class MarketSizeRawData(object):
    """Loads one day of raw market-size estimates and reshapes it for comparison.

    ``get`` reads the raw parquet partition, maps store / category / device
    identifiers through the project mapping tables, pivots the metrics into
    columns and drops rows whose category or country is not a known mapped value.
    """
    raw_s3_path = "s3://b2c-prod-dca-store-estimates/store_estv2/MARKET_SIZE_ESTIMATES_FINAL/" \
                  "version=2.0.0/range_type=DAY/date={date}/"
    # Keys are str(platform_id) + device, as built in _parse_mapping.
    device_code_mapping = {
        "0google-play": "android-all",
        "1ios": "ios-all",
        "1ipad": "ios-tablet",
        "1iphone": "ios-phone",
    }

    # Raw ``data_type`` values -> metric column names after the pivot.
    metric_mapping = {
        "downloads": "est_market_size_download",
        "revenue": "est_market_size_revenue"
    }

    # Raw dimension column names -> renamed dimension column names.
    dimension_mapping = {
        "price_type": "app_price_type_id",
        "purchase_type": "purchase_type_id"
    }

    def __init__(self, spark):
        """Keep the Spark session used to read the raw parquet data."""
        self.spark = spark

    def get(self, date):
        """Return the cleaned, pivoted pandas DataFrame for ``date``.

        Args:
            date: Value substituted into the ``date=`` partition of ``raw_s3_path``.
        """
        df = self._get_raw_data_by_date(date)
        df = self._parse_mapping(df)
        df = self._parse_unified_format(df)
        df = self._data_clean_up(df)
        return df

    def _data_clean_up(self, df):
        """Keep only rows whose category_id and country_code are known mapped values."""
        # Set union instead of ``values() + values()``: the latter only works when
        # dict.values() returns a list (Python 2).
        category_ids = set(CATEGORY_ID_MAPPING["apple-store"]["ios-all"].values()) | \
            set(CATEGORY_ID_MAPPING["google-play"]["android-all"].values())
        country_codes = set(COUNTRY_CODE_MAPPING["apple-store"].values()) | \
            set(COUNTRY_CODE_MAPPING["google-play"].values())

        known_rows = df['category_id'].isin(list(category_ids)) & df['country_code'].isin(list(country_codes))
        return df[known_rows]

    def _parse_mapping(self, df):
        """Translate store_id, category_id and device into unified codes.

        Rows with platform_id 0 use the "google-play" mappings and rows with
        platform_id 1 use the "apple-store" mappings. Note that the row
        assignments below modify the passed-in frame in place.
        """
        # country_code mapping
        df.loc[df["platform_id"] == 0] = df.loc[df["platform_id"] == 0].\
            replace({"store_id": COUNTRY_CODE_MAPPING["google-play"]})
        df.loc[df["platform_id"] == 1] = df.loc[df["platform_id"] == 1].\
            replace({"store_id": COUNTRY_CODE_MAPPING["apple-store"]})
        df = df.rename(columns={'store_id': 'country_code'})

        # category_id mapping
        df.loc[df["platform_id"] == 0] = df.loc[df["platform_id"] == 0].\
            replace({"category_id": CATEGORY_ID_MAPPING["google-play"]["android-all"]})
        df.loc[df["platform_id"] == 1] = df.loc[df["platform_id"] == 1].\
            replace({"category_id": CATEGORY_ID_MAPPING["apple-store"]["ios-all"]})

        # device_code mapping: build the "<platform_id><device>" key, then translate it
        df["device_code"] = df["platform_id"].astype(str) + df["device"]
        df = df.replace({"device_code": self.device_code_mapping})
        return df

    def _parse_unified_format(self, df):
        """Pivot ``data_type`` rows into one metric column per data type.

        Rows sharing the same dimension key are combined with pivot_table's
        default aggregation.
        """
        df = df.rename(columns=self.dimension_mapping)
        df = df.pivot_table(index=["app_price_type_id", "purchase_type_id", "category_id",
                                   "device_code", "country_code"], columns='data_type', values='estimate')
        df = df.reset_index()
        # Drop the "data_type" label left on the column axis by the pivot.
        df.columns.name = None
        df = df.rename(columns=self.metric_mapping)
        return df

    def _get_raw_data_by_date(self, date):
        """
        Read the raw parquet partition for ``date`` and return it as a pandas DataFrame.

        +--------+----------+-----------+-----------+---------+----------+-------------+-----------+--------+
        |store_id|      date|platform_id|     device|data_type|price_type|purchase_type|category_id|estimate|
        +--------+----------+-----------+-----------+---------+----------+-------------+-----------+--------+
        |      10|2020-01-10|          0|google-play|downloads|         1|           10|          1|11981138|
        +--------+----------+-----------+-----------+---------+----------+-------------+-----------+--------+
        """
        # No explicit schema is supplied to the reader; the sample row above shows
        # the columns the rest of this class relies on.
        raw_df = self.spark.read.parquet(self.raw_s3_path.format(date=date))
        return raw_df.toPandas()

    def get_rank_count_and_sum_by_date(self, date):
        """Placeholder: not implemented, currently returns None."""
        pass
        # return raw_count, raw_sum


class MarketSizeUnifiedData(object):
    """Reads one daily partition of the unified market-size fact data."""
    unified_s3_path = (
        "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.market-size.v1/fact/"
        "granularity=daily/date={}/"
    )

    def __init__(self, spark):
        """Keep the Spark session used for reading parquet."""
        self.spark = spark

    def get(self, date):
        """Return the partition for ``date`` as a pandas DataFrame without ``_identifier``."""
        partition_path = self.unified_s3_path.format(date)
        spark_frame = self.spark.read.parquet(partition_path)
        pandas_frame = spark_frame.toPandas()
        return pandas_frame.drop(["_identifier"], axis=1)


class MarketSizeDBData(object):
    """Runs market-size queries through ``query_df`` on ``MARKET_SIZE_DSN``."""

    def get(self, date):
        """Return whatever ``query_df`` yields for the fact rows of ``date``."""
        template = "SELECT * FROM store.store_market_size_fact_v1 WHERE date='{}'"
        statement = template.format(date)
        return query_df(MARKET_SIZE_DSN, statement)

    def get_led(self):
        """Return whatever ``query_df`` yields for the whole latest-date table."""
        statement = "SELECT * FROM store.store_market_size_latest_date_v1"
        return query_df(MARKET_SIZE_DSN, statement)


class TestMarketSizeWeekly(PipelineTest):
    """Weekly checks on market-size data: raw / unified / DB agreement and latest date."""
    # Every Tuesday 16:00 UTC time will refresh the data of last Full Week.
    trigger_date_config = ('* 16 * * 2', 3)

    def _compare_df(self, df1, df2, log=''):
        """Assert that ``df1`` and ``df2`` hold the same rows.

        The frames are merged on their common columns, once as a left join and
        once as a right join; any row not present on both sides fails the test.
        Before failing, the distinct country / category / device values of the
        unmatched rows and both input frames are printed.

        Args:
            df1: Left-hand pandas DataFrame.
            df2: Right-hand pandas DataFrame.
            log: Free-text label included in the failure message.
        """
        for diff_type in ["left", "right"]:
            diff_df = df1.merge(df2, indicator=True, how=diff_type)
            diff_df = diff_df.loc[diff_df["_merge"] != "both"]
            if len(diff_df) != 0:
                print(diff_type)
                print(diff_df.country_code.unique())
                print(diff_df.category_id.unique())
                print(diff_df.device_code.unique())
                print(df1)
                print(df2)
            self.assertEqual(len(diff_df), 0,
                             msg="found mismatch when compare the raw, unified, db."
                                 " diff count is \n {}, logs:{}".format(len(diff_df), log))

    @etl_skip()
    def test_market_size_etl_accuracy_and_completeness(self):
        """Compare raw vs unified and unified vs DB data for every date of the checked week."""
        # Every Tuesday 16:00 UTC time will refresh the data of last Full Week.
        start_date, end_date = get_week_start_end_date(self.check_date_str)
        date_list = get_date_list(start_date, end_date)
        for date in date_list:
            raw_df = MarketSizeRawData(self.spark).get(date)
            unified_df = MarketSizeUnifiedData(self.spark).get(date)
            db_df = MarketSizeDBData().get(date)

            self._compare_df(raw_df, unified_df, log="raw / unified")
            self._compare_df(unified_df, db_df, log="unified / db")

    def test_market_size_etl_timelines(self):
        """Check the date derived from the routing config for a fixed trigger time."""
        # Every Tuesday 16:00 UTC time will refresh the data of last Full Week.
        # E.g. 2020-02-11 17:00 the data of 2020-02-02 ~ 2020-02-08 will be ready
        trigger_datetime = datetime.datetime.strptime("2020-02-11 17:00:00", '%Y-%m-%d %H:%M:%S')
        check_date_str_actual = self._get_check_date_from_routing_config(trigger_datetime).strftime("%Y-%m-%d")
        self.assertEqual("2020-02-08", check_date_str_actual)

    @etl_skip()
    def test_market_size_led(self):
        """Check that every device's latest date in the DB equals the checked week's end date."""
        _, expected_led_date = get_week_start_end_date(self.check_date_str)
        led_df = MarketSizeDBData().get_led()
        for device_code in ["ios-all", "android-all", "ios-tablet", "ios-phone"]:
            device_rows = led_df.loc[led_df.device_code == device_code]
            # Fail with a readable message instead of an IndexError from ``.values[0]``
            # when the latest-date table has no row for this device.
            self.assertFalse(device_rows.empty,
                             msg="no latest-date row found for device : {}".format(device_code))
            actual_led_date = device_rows.date.values[0].strftime("%Y-%m-%d")
            # Single assertion: the bare ``assert`` that used to precede this call
            # fired first and hid the descriptive message below.
            self.assertEqual(expected_led_date, actual_led_date,
                             msg="current led for device : {} is {}, it should be {}".format(
                                 device_code, actual_led_date, expected_led_date))


In [0]:


from applications.db_check_v1.cases.store.market_size_v1.test_market_size_v1 import TestMarketSizeWeekly

#
from applications.db_check_v1.cases.advanced_review.test_advainced_review import TestAdvancedReview
from applications.db_check_v1.cases.aso.test_aso import TestASO, TestASOSOVDataDaily, \
    TestASOSOVDataWeekly, TestASOSOVDataMonthly, TestASOMetrics
from applications.db_check_v1.cases.market.test_market import TestMarketDimensionDaily, \
    TestMarketLogsFactAndSeenDaily, TestMarketMonthlyCheck, TestMarketWeeklyCheck
from applications.db_check_v1.cases.mobile_web.test_mobile_web import TestMobileWebDaily, \
    TestMobileWebWeekly, \
    TestMobileWebMonthly, TestMobileWebRetention
from applications.db_check_v1.cases.sdk.test_sdk import TestSDKDaily, TestSDKWeekly, TestSDKMonthly
from applications.db_check_v1.cases.store.app_rank_v1.test_app_rank_v1 import TestAppStoreRankDaily
from applications.db_check_v1.cases.store.app_rank_v1.test_app_store_rank import TestAppStoreRank
from applications.db_check_v1.cases.store.download_attribution.test_download_attribution import \
    TestDownloadAttribution
from applications.db_check_v1.cases.store.market_size_v1.test_market_size_v1 import TestMarketSizeWeekly
from applications.db_check_v1.cases.usage.test_city_level import TestCityLevelWeekly, TestCityLevelMonthly
from applications.db_check_v1.common.html_report_test_runner import HTMLTestRunner
#

import sys
import datetime
import traceback
import unittest

def debug(case_list):
    """Run the given tests with a verbose, output-buffering text runner.

    Args:
        case_list: Iterable of ``unittest.TestCase`` / ``unittest.TestSuite``
            objects; all of them are added to a single suite.

    Returns:
        The result object returned by ``TextTestRunner.run``, or ``None`` when
        building or running the suite raised an exception (the traceback is
        printed in that case).
    """
    # Remember the real streams so they can be put back whatever happens during the run.
    std_out_origin = sys.stdout
    std_err_origin = sys.stderr
    result = None
    try:
        suite = unittest.TestSuite()
        suite.addTests(case_list)
        runner = unittest.TextTestRunner(verbosity=2, buffer=True)
        result = runner.run(suite)
    except Exception:
        # print_exc() reports the active exception on both Python 2 and 3; the former
        # handler used ``ex.__traceback__`` (missing on Python 2) and ``ex.message``
        # (missing on Python 3), so it failed while reporting the error.
        traceback.print_exc()
    finally:
        sys.stdout = std_out_origin
        sys.stderr = std_err_origin
    return result
    
# Market-size test method names; not referenced anywhere else in this cell.
case_name_list = [
    "test_market_size_etl_accuracy_and_completeness",
    "test_market_size_etl_timelines",
    "test_market_size_led",
]

# Load every test of each market test class into its own suite, in this order.
test_case_list = list(map(
    unittest.TestLoader().loadTestsFromTestCase,
    (
        TestMarketDimensionDaily,
        TestMarketLogsFactAndSeenDaily,
        TestMarketWeeklyCheck,
        TestMarketMonthlyCheck,
    ),
))


debug(test_case_list)


In [0]:


# Read the unified genre-estimate daily fact partition for one date and show its first 5 rows.
date="2020-01-01"
est_unified_df = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.genre-est.v1/fact/granularity=daily/date={}/".format(date))
est_unified_df.show(5)

In [0]:
%%sh

# List what is stored under the raw daily market-size estimates prefix for 2020-01-13.
aws s3 ls s3://b2c-prod-dca-store-estimates/store_estv2/MARKET_SIZE_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/date=2020-01-13/


In [0]:
%%sh
