In [0]:

# Copyright (c) 2018 App Annie Inc. All rights reserved.
# pylint: disable=E1101,C0412

"""
DB Check modules
"""

import zlib

import pandas as pd
import psycopg2
from pyspark.sql import functions as F
from pandas.io import sql as sqlio

from applications.db_check_v1.cases.store.app_rank_v1.constants import APP_STORE_RANK_METRICS, rank_bucket, \
    CATEGORY_ID_MAPPING, METRIC_MAPPING, COUNTRY_CODE_MAPPING, aa_dsn, aa_amazon_dsn, citus_dsn
from applications.db_check_v1.common.base_test import PipelineTest
from applications.db_check_v1.common.db_check_utils import query_df


class AppStoreRankRawData(object):
    """
    Base reader for the raw app-rank files stored in a rank bucket.

    Subclasses configure the class attributes below (bucket location, line
    separators and id mappings) and may override ``_get_code_app_id_mapping``
    to translate store codes into app ids.
    """
    # Bucket name passed to rank_bucket() and key prefix that holds the per-date folders.
    bucket_name = ""
    bucket_path = ""
    # Separator between the fields of one raw line.
    data_split_str = ""
    # Separator between the entries of a line's rank list (fifth field).
    rank_list_split_str = ""
    # When non-empty, every rank entry is split on it and the second piece is kept.
    rank_split_str = ""
    # Not read by any method of this class.
    accept_feeds = []
    # Raw value -> unified value lookups applied by _parse_mapping(); a falsy value skips the step.
    country_code_mapping = {}
    category_id_mapping = {}
    metric_mapping = {}
    # Not read by any method of this class.
    market_code = ""
    # When non-empty, only bucket files whose name is listed here are read.
    filename_available = []

    def __init__(self, spark):
        """
        :param spark: session object kept on the instance (no method of this class uses it)
        """
        self.spark = spark

    def get(self, date, country_code, category_id):
        """
        Return the unified-format ranks of one country/category for ``date``.

        :param date: date string used to build the bucket path
        :param country_code: unified country code to keep (compared after mapping)
        :param category_id: unified category id to keep (compared after mapping)
        :return: data frame produced by ``_parse_unified_format``
        """
        df = self._get_raw_data_by_date(date)
        df = df.loc[(df.country_code == country_code) & (df.category_id == category_id)]
        return self._parse_unified_format(df)

    def _parse_mapping(self, df):
        """
        Replace raw country / category / metric values with their mapped values.

        Each step is skipped when the corresponding mapping attribute is falsy.
        The mapped columns of ``df`` are converted to numbers in place before
        the replacement.

        :param df: raw frame with ``country_code``, ``category_id`` and ``metric`` columns
        :return: frame with the mapped values
        """
        if self.country_code_mapping:
            # Peek at one key without subscripting dict.keys(): on Python 3 that is a
            # view and cannot be indexed, on Python 2 it would copy every key first.
            sample_key = next(iter(self.country_code_mapping))
            if isinstance(sample_key, int):
                # Raw fields are text; make them comparable with the integer keys.
                df["country_code"] = pd.to_numeric(df["country_code"])
            df = df.replace({"country_code": self.country_code_mapping})

        if self.category_id_mapping:
            df["category_id"] = pd.to_numeric(df["category_id"])
            df = df.replace({"category_id": self.category_id_mapping})

        if self.metric_mapping:
            df["metric"] = pd.to_numeric(df["metric"])
            df = df.replace({"metric": self.metric_mapping})
        return df

    def _parse_unified_format(self, df):
        """
        Explode the per-metric rank lists into one row per code.

        The position of a code in a rank list (starting at 1) is its rank for
        that row's metric. Rows are then grouped by code and the metrics named
        in ``APP_STORE_RANK_METRICS`` are reduced with ``min``. Finally the code
        is resolved to an ``app_id`` through ``_get_code_app_id_mapping``.

        :param df: mapped raw frame (``metric``, ``app_rank_list``, ``category_id``,
                   ``country_code`` columns)
        :return: frame with one row per code and an ``app_id`` column
        """
        metric_columns = ["free_download", "paid_download", "revenue", "new_free_download", "new_paid_download"]
        columns = ["code", "category_id", "country_code"] + metric_columns
        data_list = []
        for _, row in df.iterrows():
            if not row.app_rank_list:
                continue
            for rank, code in enumerate(row.app_rank_list, start=1):
                # Only the column of this row's metric receives the rank.
                ranks = [rank if row.metric == name else None for name in metric_columns]
                data_list.append([code, row.category_id, row.country_code] + ranks)
        new_df = pd.DataFrame(data_list, columns=columns)
        aggregate_dict = {metric: 'min' for metric in APP_STORE_RANK_METRICS}
        # reindex() restores the original column order; columns that were not
        # aggregated come back without values.
        new_df = new_df.groupby(new_df['code'], as_index=False).aggregate(aggregate_dict). \
            reindex(columns=new_df.columns)
        # code to app_ids
        code_app_id_mapping = self._get_code_app_id_mapping(new_df.code.tolist())
        if code_app_id_mapping is not None:
            new_df = new_df.merge(code_app_id_mapping, on='code', how="left").rename(columns={'id': 'app_id'})
        else:
            new_df = new_df.rename(columns={'code': 'app_id'})
        return new_df

    def _get_raw_data_by_date(self, date):
        """
        Read every accepted raw file of ``date`` and return the mapped rows.

        :return: raw_data_frame
        :rtype: pandas.DataFrame with columns
                date, country_code, category_id, metric, app_rank_list
        raw_data:
        _________________________________________________________________________________
        |    date    |   country_id   |  category_id  |   feed_id   |   rank (app_id)   |
        |------------|----------------|---------------|-------------|-------------------|
        | 2019-04-27 | 143441(bigint) |   6016 (int)  |   0 (int)   | 376510438(bigint) |
        ---------------------------------------------------------------------------------
        unified_data:
        _____________________________________________________________________________________
        |  country_code  |   category_id   |       app_id       | feed_name (free_download) |
        |----------------|-----------------|--------------------|---------------------------|
        |      'US'      | 100026 (bigint) | 376510438 (bigint) |    25 (int) (app_rank)    |
        -------------------------------------------------------------------------------------
        """
        # NOTE(review): the fixed "23" folder is presumably the hour-23 snapshot - confirm.
        path = "{_bucket_path}/{_date}/23/".format(_date=date, _bucket_path=self.bucket_path)
        bucket = rank_bucket(self.bucket_name)
        columns = ['date', 'country_code', 'category_id', 'metric', 'app_rank_list']

        _raw_data_list = []
        for filepath in bucket.list(path):
            filename = filepath.replace(path, '')
            if self.filename_available and filename not in self.filename_available:
                continue
            _raw_data = zlib.decompress(bucket.get(filepath))
            for _line in _raw_data.splitlines():
                line_data = _line.split(self.data_split_str)
                # Drop the blank entries left by repeated separators.
                app_rank_list = [rank_app for rank_app in line_data[4].split(self.rank_list_split_str) if
                                 rank_app.strip() != '']
                if self.rank_split_str:
                    app_rank_list = [app_rank.split(self.rank_split_str)[1] for app_rank in app_rank_list]
                line_data[4] = app_rank_list
                _raw_data_list.append(line_data)
        return self._parse_mapping(pd.DataFrame(_raw_data_list, columns=columns))

    def _get_code_app_id_mapping(self, code_list):
        """
        Build the code -> id lookup frame; here the id is simply ``int(code)``.

        :param code_list: iterable of codes convertible with ``int()``
        :return: data frame with ``id`` and ``code`` columns
        """
        mapping_df = pd.DataFrame([[int(code), code] for code in code_list], columns=["id", "code"])
        return mapping_df

    def get_rank_count_and_sum_by_date(self, date):
        """
        Count the ranked codes of ``date`` that resolve to an app id and sum their ranks.

        Within one raw row a code counts once, with its best (minimum) rank.

        :param date: date string used to build the bucket path
        :return: ``(raw_count, raw_sum)``
        """
        raw_df = self._get_raw_data_by_date(date)
        raw_agg_df_list = []
        for _, row in raw_df.iterrows():
            if not row.app_rank_list:
                continue
            _df = pd.DataFrame([[app_id, app_index] for app_index, app_id in enumerate(row.app_rank_list, start=1)],
                               columns=["code", "rank"])
            aggregation_functions = {'rank': 'min'}
            raw_agg_df = _df.groupby(_df['code'], as_index=False) \
                .aggregate(aggregation_functions) \
                .reindex(columns=_df.columns)
            raw_agg_df_list.append(raw_agg_df)

        raw_agg_df_all = pd.concat(raw_agg_df_list, ignore_index=True, sort=False)
        app_id_mapping_df = self._get_code_app_id_mapping(list(set(raw_agg_df_all.code.tolist())))
        if app_id_mapping_df is None:
            # No lookup available: every code is its own app id (the same fallback
            # _parse_unified_format uses), so every row counts.
            resolved = raw_agg_df_all
        else:
            merged = raw_agg_df_all.merge(app_id_mapping_df, on='code', how="left")
            resolved = merged[merged.id.notnull()]
        raw_count = len(resolved)
        raw_sum = resolved["rank"].sum()
        return raw_count, raw_sum


class MacRaw(AppStoreRankRawData):
    """Raw rank reader configured for the ``mac-os-mac`` device."""
    bucket_name = "prod_appannie_ios"
    bucket_path = "mac/country-ranks"
    data_split_str = "\t"
    rank_list_split_str = " "

    device_code = "mac-os-mac"
    market_code = 'apple-store'

    category_id_mapping = CATEGORY_ID_MAPPING['mac-os-mac']
    metric_mapping = METRIC_MAPPING['mac-os-mac']
    country_code_mapping = COUNTRY_CODE_MAPPING['ios']

    # Accepted file names are the stringified keys of the iOS country mapping.
    # map() is used instead of a comprehension over ``id``: that name shadows the
    # builtin and, on Python 2, the loop variable would stay behind as a class attribute.
    filename_available = list(map(str, COUNTRY_CODE_MAPPING['ios']))

# class IPhoneRaw(AppStoreRankRawData):
#     bucket_name = "prod_appannie_ios"
#     bucket_path = "country-ranks"
#     data_split_str = "\t"
#     rank_list_split_str = " "
#     accept_feeds = [0, 1, 2]
#
#
# class IPadRaw(AppStoreRankRawData):
#     bucket_name = "prod_appannie_ios"
#     bucket_path = "country-ranks"
#     data_split_str = "\t"
#     rank_list_split_str = " "


class AppleTvRaw(AppStoreRankRawData):
    """Raw rank reader configured for the ``tv-os-tv`` device."""
    bucket_name = "prod_appannie_appletv"
    bucket_path = "country-ranks"
    data_split_str = ","
    rank_list_split_str = " "
    rank_split_str = "-"
    # Falsy on purpose: the base class then skips the country-code mapping step.
    country_code_mapping = None
    category_id_mapping = CATEGORY_ID_MAPPING['tv-os-tv']
    metric_mapping = METRIC_MAPPING['tv-os-tv']

    device_code = "tv-os-tv"
    market_code = 'apple-store'
    # Accepted file names are the values of the iOS country mapping. Taking
    # .values() avoids a comprehension variable named ``id`` (shadows the builtin
    # and leaks into the class namespace on Python 2).
    filename_available = list(COUNTRY_CODE_MAPPING['ios'].values())

    # this should be a product_service
    def _get_code_app_id_mapping(self, code_list):
        """
        Look up the product ids of ``code_list`` in the ``product`` table.

        The codes come from raw rank files, so they are passed as a bound array
        parameter instead of being spliced into the SQL text.

        :param code_list: iterable of product codes
        :return: data frame with ``id`` and ``code`` columns
        """
        sql = "select id,code from product where code = ANY(%(codes)s)"
        conn = psycopg2.connect(aa_dsn)
        try:
            # psycopg2 adapts a Python list to a PostgreSQL array.
            return sqlio.read_sql_query(sql, conn, params={"codes": list(code_list)})
        finally:
            # Always release the connection, even when the query fails.
            conn.close()


class AmazonRaw(AppStoreRankRawData):
    """Raw rank reader configured for the ``amazon-store`` market, ``android-all`` device."""
    bucket_name = "prod_appannie_amazon"
    bucket_path = "country-ranks"
    data_split_str = ","
    rank_list_split_str = " "
    rank_split_str = "-"
    device_code = "android-all"
    market_code = 'amazon-store'

    # Only files with these names are read from the bucket.
    filename_available = ['FR', 'CN', 'CA', 'DE', 'JP', 'IT', 'US', 'UK', 'ES']

    country_code_mapping = COUNTRY_CODE_MAPPING[market_code][device_code]
    category_id_mapping = CATEGORY_ID_MAPPING[market_code][device_code]
    metric_mapping = METRIC_MAPPING[market_code][device_code]

    # this should be a product_service
    def _get_code_app_id_mapping(self, code_list):
        """
        Look up the app ids of ``code_list`` in the ``app`` table.

        The codes come from raw rank files, so they are passed as a bound array
        parameter instead of being spliced into the SQL text.

        :param code_list: iterable of app codes
        :return: data frame with ``id`` and ``code`` columns
        """
        sql = "select id,code from app where code = ANY(%(codes)s)"
        conn = psycopg2.connect(aa_amazon_dsn)
        try:
            # psycopg2 adapts a Python list to a PostgreSQL array.
            return sqlio.read_sql_query(sql, conn, params={"codes": list(code_list)})
        finally:
            # Always release the connection, even when the query fails.
            conn.close()


class AppStoreRankUnifiedData(object):
    """
    Reader for the unified app-rank parquet data.

    Subclasses set ``s3_path`` and ``device_code``; together with the date they
    form the parquet location that is read.
    """
    device_code = ''
    s3_path = ""
    category_id_mapping = {}

    def __init__(self, spark):
        """
        :param spark: Spark session used to read the parquet files
        """
        self.spark = spark

    def get(self, date, country_code, category_id):
        """
        Load the rows of one country/category for ``date`` as a pandas frame.
        """
        condition = "country_code='{country_code}' and category_id='{category_id}'".format(
            category_id=category_id, country_code=country_code)
        return self._get_unified_df_by_date(date, condition).toPandas()

    def _get_unified_df_by_date(self, date, filter_str):
        """
        Read the parquet data of ``date`` for this device, optionally filtered.

        :param filter_str: filter expression handed to ``DataFrame.filter``; skipped when falsy
        """
        location = "{}date={}/device_code={}/".format(self.s3_path, date, self.device_code)
        frame = self.spark.read.parquet(location)
        return frame.filter(filter_str) if filter_str else frame

    def get_rank_count_and_sum_by_date(self, date, filter_str=None):
        """
        Total the non-null rank counts and the rank sums over the five metrics.

        Only rows with a non-null ``app_id`` take part.

        :return: ``(unified_count, unified_sum)``
        """
        metrics = ("free_download", "paid_download", "revenue", "new_free_download", "new_paid_download")
        # Counts first, then sums, each aliased as <metric>_count / <metric>_sum.
        aggregations = [F.count(metric).alias(metric + "_count") for metric in metrics]
        aggregations += [F.sum(metric).alias(metric + "_sum") for metric in metrics]

        rows = self._get_unified_df_by_date(date, filter_str) \
            .filter('app_id is not null') \
            .agg(*aggregations) \
            .collect()
        data = rows[0]

        unified_count = sum([getattr(data, metric + "_count") for metric in metrics])
        # A metric without any value sums to null; treat that as 0.
        unified_sum = sum([getattr(data, metric + "_sum") or 0 for metric in metrics])
        return unified_count, unified_sum


class AppleTvUnified(AppStoreRankUnifiedData):
    """Unified rank data of the ``tv-os-tv`` device (app-rank v1 fact location)."""
    s3_path = "s3://b2c-prod-data-pipeline-unified-store-free/unified/app-tech.store.app-rank.v1/fact/"
    device_code = 'tv-os-tv'


class MacUnified(AppStoreRankUnifiedData):
    """Unified rank data of the ``mac-os-mac`` device (app-rank v1 fact location)."""
    s3_path = "s3://b2c-prod-data-pipeline-unified-store-free/unified/app-tech.store.app-rank.v1/fact/"
    device_code = 'mac-os-mac'


class AmazonUnified(AppStoreRankUnifiedData):
    """Unified rank data of the ``amazon-store`` market, ``android-all`` device (app-rank v2 fact location)."""
    s3_path = "s3://b2c-prod-data-pipeline-unified-store-free/unified/app-tech.store.app-rank.v2/fact/"
    market_code = 'amazon-store'
    device_code = 'android-all'
    # Category lookup is keyed by market first, then by device.
    category_id_mapping = CATEGORY_ID_MAPPING[market_code][device_code]


class AppStoreRankDBData(object):
    """
    Reader for the app-rank fact table, queried through ``query_df``.

    Subclasses set ``device_code`` and ``category_id_mapping``.
    """
    schema = "store"
    table = "store_app_rank_fact_v1"
    device_code = ''
    # Only its values are used here: their min/max bound the category range
    # in get_rank_count_and_sum_by_date().
    category_id_mapping = {}

    def __init__(self, spark):
        """
        :param spark: session object kept on the instance (no method of this class uses it)
        """
        self.spark = spark

    def get(self, date, country_code, category_id):
        """
        Fetch the fact rows of one date / country / category for this device.

        :return: whatever ``query_df`` returns for the SELECT
        """
        sql = "SELECT * FROM {schema}.{table} WHERE date ='{date}' AND device_code='{device_code}' AND " \
              "country_code='{country_code}' AND category_id='{category_id}'" \
            .format(schema=self.schema, table=self.table, date=date, device_code=self.device_code,
                    country_code=country_code, category_id=category_id)
        result = query_df(citus_dsn, sql)
        return result

    def get_rank_count_and_sum_by_date(self, date):
        """
        Total the rank counts and the rank sums of ``date`` over all rank metrics.

        Rows are restricted to this device and to the category range spanned by
        the values of ``category_id_mapping``.

        :param date: date string compared with the ``date`` column
        :return: ``(db_count, db_sum)``
        :raises ValueError: when ``category_id_mapping`` is empty (``min()`` of nothing)
        """
        sql_select = ["sum({metric}) AS {metric}_sum ,count({metric}) AS {metric}_count".format(
            metric=metric) for metric in APP_STORE_RANK_METRICS]
        sql = "SELECT {sql_select} FROM {schema}.{table} WHERE date ='{date}' AND device_code='{device_code}' AND " \
              "category_id BETWEEN {category_id_min} AND {category_id_max}" \
            .format(sql_select=",".join(sql_select), schema=self.schema, table=self.table,
                    date=date, device_code=self.device_code,
                    category_id_min=min(self.category_id_mapping.values()),
                    category_id_max=max(self.category_id_mapping.values()))
        data = query_df(citus_dsn, sql).iloc[0]
        # Read back exactly the columns the SELECT above produced.
        db_count = sum([data["{}_count".format(metric)] for metric in APP_STORE_RANK_METRICS])
        metric_sums = [data["{}_sum".format(metric)] for metric in APP_STORE_RANK_METRICS]
        # SUM() over no rows is NULL, which may arrive as None or as NaN. NaN is
        # truthy, so ``value or 0`` would let it poison the total.
        db_sum = sum([0 if pd.isnull(value) else value for value in metric_sums])
        return db_count, db_sum


class AppleTvDB(AppStoreRankDBData):
    """Fact-table reader for the ``tv-os-tv`` device."""
    device_code = 'tv-os-tv'
    # Category lookup is keyed directly by the device code.
    category_id_mapping = CATEGORY_ID_MAPPING[device_code]


class AmazonDB(AppStoreRankDBData):
    """Fact-table reader for the ``amazon-store`` market, ``android-all`` device."""
    market_code = 'amazon-store'
    device_code = 'android-all'
    # Category lookup is keyed by market first, then by device.
    category_id_mapping = CATEGORY_ID_MAPPING[market_code][device_code]


class MacDB(AppStoreRankDBData):
    """Fact-table reader for the ``mac-os-mac`` device."""
    device_code = 'mac-os-mac'
    # Category lookup is keyed directly by the device code.
    category_id_mapping = CATEGORY_ID_MAPPING[device_code]


class TestAppStoreRankDaily(PipelineTest):
    """
    Compares app-store ranks between the raw files, the unified layer and the
    database: per-app accuracy for one country/category, and count/sum
    completeness for a whole day.
    """
    routing_config = ('* 9 * * *', 1)

    def _check_app_list_equal_between_data_source(self, raw_df, unified_df, db_df):
        """
        For every rank metric, assert that the three frames list the same apps
        with the same ranks once sorted by that metric.
        """
        sources = (('raw', raw_df), ('unified', unified_df), ('db', db_df))
        for metric in APP_STORE_RANK_METRICS:
            # Keep only the rows ranked for this metric, ordered by rank.
            ranked = {}
            for label, frame in sources:
                ranked[label] = frame[pd.notnull(frame[metric])].sort_values(metric)

            data_app = {}
            for label, _ in sources:
                data_app[label] = ranked[label].app_id.tolist()

            data_rank = {}
            for label, _ in sources:
                data_rank[label] = ranked[label][metric].tolist()

            self.assertTrue(data_app['raw'] == data_app['unified'] == data_app['db'],
                            msg="App list not equal: {}".format(data_app))
            self.assertTrue(data_rank['raw'] == data_rank['unified'] == data_rank['db'],
                            msg="Rank list not equal: {}".format(data_rank))

    def _check_rank_count_equal_between_data_source(self, raw_count, unified_count, db_count):
        """Assert that the three rank counts are identical."""
        rank_count = {}
        rank_count['raw'] = raw_count
        rank_count['unified'] = unified_count
        rank_count['db'] = db_count
        all_equal = rank_count['raw'] == rank_count['unified'] == rank_count['db']
        self.assertTrue(all_equal, msg="Rank count not equal: {}".format(rank_count))

    def _check_rank_sum_equal_between_data_source(self, raw_sum, unified_sum, db_sum):
        """Assert that the three rank sums are identical."""
        rank_sum = {}
        rank_sum['raw'] = raw_sum
        rank_sum['unified'] = unified_sum
        rank_sum['db'] = db_sum
        all_equal = rank_sum['raw'] == rank_sum['unified'] == rank_sum['db']
        self.assertTrue(all_equal, msg="Rank sum not equal: {}".format(rank_sum))

    def _check_accuracy(self, raw_cls, unified_cls, db_cls, country_code, category_id):
        """Load one country/category from each source (raw, unified, db) and compare them."""
        raw_df = raw_cls(self.spark).get(self.check_date_str, country_code, category_id)
        unified_df = unified_cls(self.spark).get(self.check_date_str, country_code, category_id)
        db_df = db_cls(self.spark).get(self.check_date_str, country_code, category_id)
        self._check_app_list_equal_between_data_source(raw_df, unified_df, db_df)

    def _count_and_sum_per_source(self, raw_cls, unified_cls, db_cls):
        """Return the ``(count, sum)`` pairs of the check date for raw, unified and db."""
        raw = raw_cls(self.spark).get_rank_count_and_sum_by_date(self.check_date_str)
        unified = unified_cls(self.spark).get_rank_count_and_sum_by_date(self.check_date_str)
        db = db_cls(self.spark).get_rank_count_and_sum_by_date(self.check_date_str)
        return raw, unified, db

    def _check_completeness(self, raw_cls, unified_cls, db_cls):
        """Compare the day's rank count and rank sum across the three sources."""
        raw, unified, db = self._count_and_sum_per_source(raw_cls, unified_cls, db_cls)
        self._check_rank_count_equal_between_data_source(raw[0], unified[0], db[0])
        self._check_rank_sum_equal_between_data_source(raw[1], unified[1], db[1])

    def test_apple_tv_etl_accuracy(self):
        self._check_accuracy(AppleTvRaw, AppleTvUnified, AppleTvDB,
                             country_code='US', category_id=300000)

    def test_amazon_etl_accuracy(self):
        self._check_accuracy(AmazonRaw, AmazonUnified, AmazonDB,
                             country_code='US', category_id=720000)

    def test_mac_etl_accuracy(self):
        self._check_accuracy(MacRaw, MacUnified, MacDB,
                             country_code='US', category_id=200000)

    def test_amazon_etl_completeness(self):
        raw, unified, db = self._count_and_sum_per_source(AmazonRaw, AmazonUnified, AmazonDB)

        # Count and sum of the unified rows whose category_id lies outside 700000 ~ 799999.
        outside_range = "category_id NOT BETWEEN 700000 AND 799999"
        skipped_count, skipped_sum = AmazonUnified(self.spark).get_rank_count_and_sum_by_date(
            self.check_date_str, outside_range)

        # Those rows are not loaded into the db during ETL, so their count/sum
        # is added to the db figures before comparing.
        self._check_rank_count_equal_between_data_source(raw[0], unified[0], db[0] + skipped_count)
        self._check_rank_sum_equal_between_data_source(raw[1], unified[1], db[1] + skipped_sum)

    def test_apple_tv_etl_completeness(self):
        self._check_completeness(AppleTvRaw, AppleTvUnified, AppleTvDB)

    def test_mac_etl_completeness(self):
        self._check_completeness(MacRaw, MacUnified, MacDB)




In [0]:


import sys
import datetime
import traceback
import unittest

# from applications.db_check_v1.cases.store.app_rank_v1.test_app_rank_v1 import TestAppStoreRankDaily


def debug(case_list):
    """
    Run the given test cases with a verbose, output-buffering text runner.

    Any exception raised while building or running the suite is reported
    (attribute names, message and traceback) instead of propagating, and
    ``sys.stdout`` / ``sys.stderr`` are put back to the objects they were on entry.

    :param case_list: iterable of ``unittest`` test case instances
    :return: None
    """
    std_out_origin = sys.stdout
    std_err_origin = sys.stderr
    try:
        suite = unittest.TestSuite()
        suite.addTests(case_list)
        runner = unittest.TextTestRunner(verbosity=2, buffer=True)
        runner.run(suite)
    except Exception as ex:
        # print(...) with a single argument and str(ex) behave the same on
        # Python 2 and 3; the old handler mixed Python 2-only syntax with the
        # Python 3-only ``ex.__traceback__`` and could not run on either.
        print(dir(ex))
        print(str(ex))
        traceback.print_exc()
    finally:
        # buffer=True makes the runner swap the streams; make sure they are restored.
        sys.stdout = std_out_origin
        sys.stderr = std_err_origin
    
# Names of the TestAppStoreRankDaily methods to run, grouped per store.
case_name_list = [
    # Amazon
    "test_amazon_etl_accuracy",
    "test_amazon_etl_completeness",

    # Apple TV
    "test_apple_tv_etl_accuracy",
    "test_apple_tv_etl_completeness",

    # Mac
    "test_mac_etl_accuracy",
    "test_mac_etl_completeness",
]

# One test-case instance per name, each built with the same fixed date.
# NOTE(review): the second constructor argument is presumably the check date - confirm against PipelineTest.
test_case_list = [
    TestAppStoreRankDaily(name, datetime.datetime.strptime("2020-01-01", "%Y-%m-%d"))
    for name in case_name_list
]  # pass

debug(test_case_list)


In [0]:
 


# Environment bootstrap: `spark` is not defined in this notebook and is presumably
# injected by the notebook interpreter - confirm.
# NOTE(review): this cell sits after the cells that import `applications.*`; on a fresh
# interpreter it presumably has to run before them - confirm the intended run order.
from bdce.common.utils import update_application_code
update_application_code(
    spark, role="BDP-PROD-APP-INT-QA", application_name="aa-int-qa-db-check-debug"
)
# Register the dependency archive with the Spark context so its modules can be imported.
spark.sparkContext.addPyFile("/tmp/zeppelin_application_code/libs/python/dependencies.zip")


In [0]:


# Register the dependency archive from the cluster's application directory with the Spark context.
# NOTE(review): looks like an alternative to the /tmp/zeppelin_application_code archive - confirm which one is needed.
spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")


In [0]:
%%sh

# List the downloaded application code directory.
ls -al /tmp/zeppelin_application_code
# Marker line separating the two listings in the cell output.
echo 123123123
# List the application directory on the host.
ls -al /home/hadoop/bdp/application/


In [0]:
%%sh
