In [0]:

spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import pandas as pd
pd.set_option('expand_frame_repr', False)
import datetime
from applications.db_check_v1.common.base_test import PipelineTest
from applications.db_check_v1.common.db_check_utils import query_df, etl_skip
from applications.db_check_v1.common.constants import COUNTRY_CODE_MAPPING_BY_MARKET_CODE as COUNTRY_CODE_MAPPING, \
    CATEGORY_ID_MAPPING_BY_MARLKET_AND_DEVICE_CODE as CATEGORY_ID_MAPPING
from applications.db_check_v1.common.utils import get_week_start_end_date, get_date_list
#from applications.db_check_v1.cases.store.publisher_est_v1.constants import MARKET_SIZE_DSN

def get_date_list(begin_date, end_date, freq="D"):
    """Return the dates from ``begin_date`` to ``end_date`` as ``YYYY-MM-DD`` strings.

    NOTE: this definition shadows the ``get_date_list`` imported from
    ``applications.db_check_v1.common.utils`` earlier in the notebook.  ``freq``
    defaults to daily so that two-argument callers keep working.

    :param begin_date: first date of the range (anything ``pd.date_range`` accepts).
    :param end_date: last date of the range (inclusive when it falls on the frequency grid).
    :param freq: pandas frequency string, e.g. ``"D"`` for one entry per day.
    :return: list of formatted date strings, in ascending order.
    """
    dates = pd.date_range(start=begin_date, end=end_date, freq=freq)
    return [day.strftime('%Y-%m-%d') for day in dates]

import boto3

# S3 object that receives the regression log produced by this notebook
# (it is the target passed to write_log).
s3 = boto3.resource('s3')
s3object = s3.Object(
    'b2c-prod-data-pipeline-qa',
    'tom/top_publisher/regression5.txt',
)

def write_log(strobj, s3obj):
    """Upload ``strobj`` (converted with ``str``) as the body of ``s3obj``.

    :param strobj: value to store; it is stringified before upload.
    :param s3obj: object exposing ``put(Body=...)``, e.g. a boto3 S3 ``Object``.
    """
    payload = str(strobj)
    s3obj.put(Body=payload)


class PublisherEstRawData(object):
    """Reader for the raw daily publisher estimates stored as parquet on S3.

    ``get`` returns one pandas row per publisher / category / device / country
    with one column per metric, so it can be merged against the unified data.
    Throughout the class ``platform_id`` 0 is paired with the ``google-play``
    mappings and ``platform_id`` 1 with the ``apple-store`` mappings.
    """
    raw_s3_path = "s3://b2c-prod-dca-store-estimates/store_estv2/PUB_ESTIMATES_FINAL/version=2.0.0/range_type=DAY" \
                  "/date={}/"

    # (market, platform_id) pairs, iOS first to keep the original evaluation order.
    _PLATFORMS = (("apple-store", 1), ("google-play", 0))

    # Key is str(platform_id) + str(feed); value is the unified device code.
    device_code_mapping = {
        "00": "android-all",
        "01": "android-all",
        "02": "android-all",
        "10": "ios-phone",
        "11": "ios-phone",
        "12": "ios-phone",
        "1100": "ios-tablet",
        "1101": "ios-tablet",
        "1102": "ios-tablet",
        "11000": "ios-all",
        "11001": "ios-all",
        "11002": "ios-all",
    }

    # Raw feed id -> unified metric column name.
    metric_mapping = {
        0: "free_app_download",
        1: "paid_app_download",
        2: "revenue",
        101: "free_app_download",
        100: "paid_app_download",
        102: "revenue",
        1000: "free_app_download",
        1001: "paid_app_download",
        1002: "revenue"
    }

    # Raw column name -> unified dimension name.
    dimension_mapping = {
        "id": "publisher_id",
    }

    def __init__(self, spark):
        """:param spark: SparkSession used to read the parquet / csv files."""
        self.spark = spark

    def get(self, date, country_code):
        """Load the raw estimates of ``date`` for ``country_code`` in unified layout.

        :param date: partition date string used to format ``raw_s3_path``.
        :param country_code: country code as found in the values of ``COUNTRY_CODE_MAPPING``.
        :return: pandas DataFrame with dimension columns plus one column per metric.
        :raises ValueError: when no store id is mapped to ``country_code``.
        """
        df = self._get_raw_data_by_date_country(date, country_code)
        df = self._parse_mapping(df)
        df = self._parse_unified_format(df)
        df = self._data_clean_up(df)
        return df

    def _data_clean_up(self, df):
        """Drop rows whose category or country was not translated by the mappings."""
        # Set union instead of `values() + values()`: dict views cannot be
        # concatenated with `+` on Python 3.
        category_id_list = list(
            set(CATEGORY_ID_MAPPING["apple-store"]["ios-all"].values()) |
            set(CATEGORY_ID_MAPPING["google-play"]["android-all"].values()))
        country_code_list = list(
            set(COUNTRY_CODE_MAPPING["apple-store"].values()) |
            set(COUNTRY_CODE_MAPPING["google-play"].values()))

        known = df['category_id'].isin(category_id_list) & df['country_code'].isin(country_code_list)
        return df[known]

    def _parse_mapping(self, df):
        """Translate raw ids (store, category, feed) into unified codes.

        The rows of the passed frame are updated in place for the store and
        category translation, exactly as the assignments via ``.loc`` imply.
        """
        # country_code mapping, per platform because the id spaces differ
        for platform_id, market in ((0, "google-play"), (1, "apple-store")):
            rows = df["platform_id"] == platform_id
            df.loc[rows] = df.loc[rows].replace({"store_id": COUNTRY_CODE_MAPPING[market]})
        df = df.rename(columns={'store_id': 'country_code'})

        # category_id mapping
        for platform_id, market, device in ((0, "google-play", "android-all"),
                                            (1, "apple-store", "ios-all")):
            rows = df["platform_id"] == platform_id
            df.loc[rows] = df.loc[rows].replace({"category_id": CATEGORY_ID_MAPPING[market][device]})

        # device_code mapping: concatenate platform and feed, then translate
        df["device_code"] = df["platform_id"].astype(str) + df["feed"].astype(str)
        df = df.replace({"device_code": self.device_code_mapping})

        # granularity
        df["granularity"] = "daily"

        # metrics mapping (from feed); must run after device_code was derived from the raw feed
        df = df.replace({"feed": self.metric_mapping})
        return df

    def _parse_unified_format(self, df):
        """Pivot the long (one row per feed) frame into one column per metric."""
        df = df.rename(columns=self.dimension_mapping)
        df = df.pivot_table(index=["publisher_id", "category_id", "device_code", "country_code", "granularity"],
                            columns='feed', values='est')
        df = df.reset_index()
        df.columns.name = None
        return df

    def _country_filter(self, country_code):
        """Build the Spark SQL filter selecting the raw rows of ``country_code``.

        Store ids are paired with their platform so that an id of one platform
        cannot select rows of the other one.

        :raises ValueError: when neither platform maps a store id to ``country_code``.
        """
        clauses = []
        for market, platform_id in self._PLATFORMS:
            store_ids = [str(k) for k, v in COUNTRY_CODE_MAPPING[market].items() if v == country_code]
            if store_ids:
                clauses.append("(platform_id = {} and store_id in ({}))".format(platform_id, ",".join(store_ids)))
        if not clauses:
            # `store_id in ()` would only fail later with an opaque Spark parse error.
            raise ValueError("no store_id is mapped to country_code {!r}".format(country_code))
        return " or ".join(clauses)

    def _get_raw_data_by_date_country(self, date, country_code):
        """
        Read the raw parquet partition of ``date`` restricted to ``country_code``.

        Sample of the raw data:
        +----------+--------+-----------+-----------+--------+----+----+-----+--------+
        |        id|store_id|category_id|platform_id|vertical|rank|feed|  est|platform|
        +----------+--------+-----------+-----------+--------+----+----+-----+--------+
        | 284417353|       0|       6006|          1|       1|   1|1002|45235|     ios|
        | 349554266|       0|       6006|          1|       1|   2|1002|20732|     ios|
        |1316153435|       0|       6006|          1|       1|   3|1002|15136|     ios|
        +----------+--------+-----------+-----------+--------+----+----+-----+--------+
        """
        country_filter = self._country_filter(country_code)
        raw_df = self.spark.read.parquet(self.raw_s3_path.format(date)).filter(country_filter).toPandas()
        return raw_df

    def get_metrics_count(self, date):
        """Count the raw rows of ``date`` whose store id is known for its platform.

        The two filters are printed for debugging.
        """
        filter_sql = "platform_id = {} and store_id in ({})"
        ios_store_id = [str(s_id) for s_id in COUNTRY_CODE_MAPPING["apple-store"].keys()]
        gp_store_id = [str(s_id) for s_id in COUNTRY_CODE_MAPPING["google-play"].keys()]
        ios_filter = filter_sql.format(1, ",".join(ios_store_id))
        gp_filter = filter_sql.format(0, ",".join(gp_store_id))

        df = self.spark.read.parquet(self.raw_s3_path.format(date))
        count_all = df.filter(ios_filter).count() + df.filter(gp_filter).count()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(ios_filter)
        print(gp_filter)
        return count_all

    def get_v1_raw_metrics_count(self, date):
        """Count the rows of the v1 tab-separated raw files of ``date`` with a known store id."""
        ios_store_id = [str(s_id) for s_id in COUNTRY_CODE_MAPPING["apple-store"].keys()]
        gp_store_id = [str(s_id) for s_id in COUNTRY_CODE_MAPPING["google-play"].keys()]

        df = self.spark.read.option("delimiter", "\t").csv(
            "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/{}/*/sbe_est_publisher/*/*".format(date))
        filter_sql = "_c2 = {} and _c0 in ({})"  # _c2 > platform_id, _c0 > store_id

        count_all = df.filter(filter_sql.format(1, ",".join(ios_store_id))).count() + \
            df.filter(filter_sql.format(0, ",".join(gp_store_id))).count()

        return count_all


class PublisherEstUnifiedData(object):
    """Reader for the unified daily publisher-estimate fact data (delta format) on S3."""
    unified_s3_path = ("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-publisher.v1/fact/"
                       "granularity=daily/date={}/")

    def __init__(self, spark):
        """:param spark: SparkSession used to load the delta data."""
        self.spark = spark

    def _load(self, date):
        """Load the delta data of the ``date`` partition as a Spark DataFrame."""
        location = self.unified_s3_path.format(date)
        return self.spark.read.format("delta").load(location)

    def get(self, date, country_code):
        """Return the unified rows of ``date`` / ``country_code`` as a pandas DataFrame.

        The ``_identifier``, ``revenue_iap``, ``revenue_non_iap`` and ``date``
        columns are dropped from the result.
        """
        country_filter = "country_code = '{}'".format(country_code)
        frame = self._load(date).filter(country_filter).toPandas()
        dropped_columns = ["_identifier", "revenue_iap", "revenue_non_iap", "date"]
        return frame.drop(dropped_columns, axis=1)

    def get_metrics_count(self, date):
        """Return the total number of non-null metric values stored for ``date``."""
        facts = self._load(date)
        metric_names = ("free_app_download", "paid_app_download", "revenue")
        return sum(facts.filter("{} is not null".format(name)).count() for name in metric_names)



class PublisherEstDBData(object):
    """Query helper for the publisher-estimate tables of the ``store`` schema.

    NOTE(review): both methods pass ``MARKET_SIZE_DSN`` to ``query_df``, but the
    import of that name is commented out at the top of this notebook, so calling
    them fails with ``NameError`` unless the name is defined elsewhere.
    """
    _FACT_SQL = "SELECT * FROM store.store_publisher_est_fact_v1 WHERE date='{}'"
    _LATEST_DATE_SQL = "SELECT * FROM store.store_publisher_est_latest_date_v1"

    def get(self, date):
        """Return the result of selecting all fact rows stored for ``date``."""
        statement = self._FACT_SQL.format(date)
        return query_df(MARKET_SIZE_DSN, statement)

    def get_led(self):
        """Return the result of selecting everything from the latest-date table."""
        statement = self._LATEST_DATE_SQL
        return query_df(MARKET_SIZE_DSN, statement)


class TestPublisherEstWeekly(PipelineTest):
    """Weekly checks comparing the raw publisher estimates with the unified data."""
    # Every Tuesday at 16:00 UTC the data of the last full week is refreshed.
    trigger_date_config = ('* 16 * * 2', 3)

    def _compare_df(self, df1, df2, log=''):
        """Assert that ``df1`` and ``df2`` contain the same rows.

        Both a left and a right merge are checked so that rows missing on either
        side are detected; an overview of the differences is printed before the
        assertion fails.

        :param log: free text appended to the assertion message.
        """
        for diff_type in ["left", "right"]:
            merged = df1.merge(df2, indicator=True, how=diff_type)
            diff_df = merged.loc[merged["_merge"] != "both"]
            if len(diff_df) != 0:
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print(diff_type)
                print("dataframe overview of df1 and df2")
                print(df1)
                print(df2)
                print("dimension overview of diff df")
                print(diff_df.country_code.unique())
                print(diff_df.category_id.unique())
                print(diff_df.device_code.unique())
            self.assertEqual(len(diff_df), 0,
                             msg="found mismatch when compare the raw, unified, db."
                                 " diff count is \n {}, logs:{}".format(len(diff_df), log))

    @etl_skip()
    def test_publisher_est_etl_accuracy(self):
        """Raw and unified data must match row by row for every day of the checked week."""
        country_code = 'US'
        start_date, end_date = get_week_start_end_date(self.check_date_str)
        # The notebook-level get_date_list takes the frequency as third argument.
        date_list = get_date_list(start_date, end_date, "D")
        for date in date_list:
            raw_df = PublisherEstRawData(self.spark).get(date, country_code)
            unified_df = PublisherEstUnifiedData(self.spark).get(date, country_code)
            # db_df = PublisherEstDBData().get(date)

            self._compare_df(raw_df, unified_df, log="raw / unified - {}".format(date))
            # self._compare_df(unified_df, db_df, log="unified / db - {}".format(date))

    @etl_skip()
    def test_publisher_Est_etl_completeness(self):
        """Raw and unified metric counts must be equal for every day of the checked week."""
        start_date, end_date = get_week_start_end_date(self.check_date_str)
        date_list = get_date_list(start_date, end_date, "D")
        for date in date_list:
            # Compare counts, not frames: `get` needs a country code on the raw
            # side and returns DataFrames, which assertEqual cannot compare.
            raw_count = PublisherEstRawData(self.spark).get_metrics_count(date)
            unified_count = PublisherEstUnifiedData(self.spark).get_metrics_count(date)
            self.assertEqual(raw_count, unified_count,
                             msg="raw / unified metrics count mismatch - {}".format(date))

    def test_publisher_est_etl_timelines(self):
        """The routing config must resolve a known trigger time to the expected check date."""
        # Every Tuesday at 16:00 UTC the data of the last full week is refreshed.
        # E.g. at 2020-02-11 17:00 the data of 2020-02-02 ~ 2020-02-08 will be ready.
        trigger_datetime = datetime.datetime.strptime("2020-02-11 17:00:00", '%Y-%m-%d %H:%M:%S')
        check_date_str_actual = self._get_check_date_from_routing_config(trigger_datetime).strftime("%Y-%m-%d")
        self.assertEqual("2020-02-08", check_date_str_actual)



In [0]:


# Regression sweep: compare the raw metrics count with the unified metrics
# count per date and checkpoint the accumulated log to S3 after every date.
# Plain decimal literals: 07 / 04 are a syntax error on Python 3.
begin_date = datetime.datetime(2018, 7, 4)
end_date = datetime.datetime(2019, 7, 13)

date_list = get_date_list(begin_date, end_date, "D")

# Override: restrict the sweep to a single date (remove to run the full range).
date_list = ["2013-03-24"]
log = ""

raw_data = PublisherEstRawData(spark)
unified_data = PublisherEstUnifiedData(spark)

for date in date_list:
    # count1 was left undefined because both candidate assignments were commented out.
    # NOTE(review): the v1 raw count is restored here as the most recent candidate;
    # switch to raw_data.get_metrics_count(date) to compare against the v2 raw data.
    count1 = raw_data.get_v1_raw_metrics_count(date)

    count2 = unified_data.get_metrics_count(date)
    log = log + "{}, {}, {}, {} \n".format(date, count1, count2, count1-count2)
    print(log)
    write_log(log, s3object)


In [0]:


# Preview the per-platform filters built by PublisherEstRawData.get_metrics_count
# (platform_id 1 = apple-store ids, platform_id 0 = google-play ids).
ios_store_id = [str(s_id) for s_id in COUNTRY_CODE_MAPPING["apple-store"].keys()]
gp_store_id = [str(s_id) for s_id in COUNTRY_CODE_MAPPING["google-play"].keys()]

fillter_sql = "platform_id = {} and store_id in ({})"

# Single-argument print(...) behaves the same on Python 2 and 3.
print(fillter_sql.format(1, ",".join(ios_store_id)))
print(fillter_sql.format(0, ",".join(gp_store_id)))


In [0]:
%%sh

# Download the regression log from S3 to a local file and print it.
log_copy=/tmp/regression5.txt
aws s3 cp s3://b2c-prod-data-pipeline-qa/tom/top_publisher/regression5.txt "$log_copy"
cat "$log_copy"


In [0]:
%%sh
