In [0]:

from bdce.common.utils import update_application_code
# Refresh the application code for the QA db-check debug application.
# NOTE(review): presumably this is what populates /tmp/zeppelin_application_code,
# which later cells read from -- confirm against bdce.common.utils.
update_application_code(
    spark,
    application_name="aa-int-qa-db-check-debug",
    role="BDP-PROD-APP-INT-QA",
)



In [0]:

# Manual step: restart the interpreter before running the cells below.


In [0]:

# Register the bundled Python dependencies with the Spark context first; the
# import below is deliberately placed after this call.
DEPENDENCIES_ZIP = "/tmp/zeppelin_application_code/libs/python/dependencies.zip"
spark.sparkContext.addPyFile(DEPENDENCIES_ZIP)

from applications.db_check_v1.db_check import send_message

# Run the db-check entry point.
send_message()



In [0]:

# Copyright (c) 2018 App Annie Inc. All rights reserved.

"""
DB Check modules
"""

# spark.sparkContext.addPyFile("/tmp/zeppelin_application_code/libs/python/dependencies.zip")

import zlib

import pandas as pd

from applications.db_check_v1.cases.store.app_rank_v1.base_test import PipelineTest
from applications.db_check_v1.cases.store.app_rank_v1.constants import APP_STORE_RANK_METRICS
from applications.db_check_v1.cases.store.app_rank_v1.reader import AppleTvRaw, AmazonRaw, AppleTvUnified, \
    AmazonUnified, AppleTvDB, AmazonDB, MacDB, MacUnified, MacRaw
    
    
from applications.db_check_v1.cases.store.app_rank_v1.reader import AppStoreRankRawData
from applications.db_check_v1.cases.store.app_rank_v1.constants import APP_STORE_RANK_METRICS, rank_bucket, \
    CATEGORY_ID_MAPPING, METRIC_MAPPING, COUNTRY_CODE_MAPPING, aa_dsn, aa_amazon_dsn, citus_dsn


class TestAppStoreRankDaily(PipelineTest):
    """Daily consistency checks for the Mac app-store rank data.

    The checks compare three views of the same day: the raw feed (``MacRaw``),
    the unified data (``MacUnified``) and the database (``MacDB``).
    """

    # NOTE(review): looks like (cron-style schedule, <int>) consumed by
    # PipelineTest -- confirm the meaning of the second element.
    routing_config = ('* 9 * * *', 1)

    def test_mac_etl_process(self):
        """For one country/category, each metric must rank the same apps in the
        same order in the raw, unified and DB data."""
        country_code = 'US'
        category_id = 200000

        # One reader instance is enough for both the fetch and the conversion.
        raw_reader = MacRaw(self.spark)
        raw_df = raw_reader.parse_df_to_unified_format(
            raw_reader.get(self.check_date_str, country_code, category_id))
        unified_df = MacUnified(self.spark).get(self.check_date_str, country_code, category_id)
        db_df = MacDB(self.spark).get(self.check_date_str, country_code, category_id)

        for metric in APP_STORE_RANK_METRICS:
            # Keep only the apps that have a rank for this metric, ordered by rank.
            _raw_df = raw_df[pd.notnull(raw_df[metric])].sort_values(metric)
            _unified_df = unified_df[pd.notnull(unified_df[metric])].sort_values(metric)
            _db_df = db_df[pd.notnull(db_df[metric])].sort_values(metric)

            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(_raw_df)
            print(_unified_df)
            print(_db_df)

            data_app = {
                'raw': _raw_df.app_id.tolist(),
                'unified': _unified_df.app_id.tolist(),
                'db': _db_df.app_id.tolist()
            }

            data_rank = {
                'raw': _raw_df[metric].tolist(),
                'unified': _unified_df[metric].tolist(),
                'db': _db_df[metric].tolist()
            }

            # The full dicts go into the failure message so a mismatch can be
            # diagnosed from the test output alone.
            self.assertTrue(data_app['raw'] == data_app['unified'] == data_app['db'], msg="{}".format(data_app))
            self.assertTrue(data_rank['raw'] == data_rank['unified'] == data_rank['db'], msg="{}".format(data_rank))

    def test_mac_transform_completeness(self):
        """Raw and unified data must agree on the number of ranked apps and on
        the sum of their ranks for the check date."""
        raw_count, raw_sum = MacRaw(self.spark).get_rank_count_and_sum_by_date(self.check_date_str)
        unified_count, unified_sum = MacUnified(self.spark).get_rank_count_and_sum_by_date(self.check_date_str)

        self.assertEqual(raw_count, unified_count)
        self.assertEqual(raw_sum, unified_sum)




In [0]:


import sys
import datetime
import traceback
import unittest

def debug(case):
    std_out_origin= sys.stdout
    std_err_origin= sys.stderr
    try:
        suite =  unittest.TestSuite()
        suite.addTest(case)
        runner = unittest.TextTestRunner(verbosity=2, buffer=True)
        runner.run(suite)
    except Exception as ex:
        print ex.message
        print traceback.format_exc()
    finally:
        sys.stdout = std_out_origin
        sys.stderr = std_err_origin
    

# testcase = AppStoreRankDailyTest("test_amazon_completeness", datetime.datetime.strptime("2019-06-07", "%Y-%m-%d") )  # pass 
# testcase = AppStoreRankDailyTest("test_apple_tv_completeness", datetime.datetime.strptime("2019-06-07", "%Y-%m-%d") )  # pass
# testcase = AppStoreRankDailyTest("test_amazon_etl_process", datetime.datetime.strptime("2019-06-07", "%Y-%m-%d") )  # pass
# Build the single test to debug for a fixed check date, then run it.
check_date = datetime.datetime.strptime("2019-06-07", "%Y-%m-%d")
testcase = TestAppStoreRankDaily("test_mac_etl_process", check_date)  # pass
debug(testcase)



In [0]:


import zlib


class AppStoreRankRawData(object):
    """Reader for raw app-store rank files kept in a rank bucket.

    Subclasses configure the class attributes below; the methods then load the
    files of one date, map the raw identifiers and reshape the rank lists.
    """

    bucket_name = ""          # passed to rank_bucket()
    bucket_path = ""          # key prefix; files are read from "<bucket_path>/<date>/23/"
    data_split_str = ""       # separator between the fields of one raw line
    rank_list_split_str = ""  # separator between the entries of the rank-list field
    rank_split_str = ""       # optional separator inside one entry; the 2nd part is kept
    accept_feeds = []         # not read by any method of this class
    country_code_mapping = {}  # raw country value -> replacement value
    category_id_mapping = {}   # raw category id -> replacement value
    metric_mapping = {}        # raw metric value -> replacement value
    market_code = ""          # not read by any method of this class
    filename_available = []   # when non-empty, only these file names are loaded

    def __init__(self, spark):
        """Keep a reference to the Spark session for subclasses/callers."""
        self.spark = spark

    def get(self, date, country_code, category_id):
        """Return the raw rows of ``date`` for one country and one category."""
        df = self.get_raw_data_by_date(date)
        return df.loc[(df.country_code == country_code) & (df.category_id == category_id)]

    def parse_mapping(self, df):
        """Replace raw country/category/metric values using the class mappings.

        Columns are converted to numbers first where the mapping keys are
        numeric. The numeric conversion is assigned on ``df`` itself.

        :param df: frame with ``country_code``, ``category_id`` and ``metric``.
        :return: frame with the mapped values.
        """
        if self.country_code_mapping:
            # next(iter(...)) works on Python 2 and 3; keys()[0] only on 2.
            first_key = next(iter(self.country_code_mapping))
            if isinstance(first_key, int):
                df["country_code"] = pd.to_numeric(df["country_code"])
            df = df.replace({"country_code": self.country_code_mapping})

        if self.category_id_mapping:
            df["category_id"] = pd.to_numeric(df["category_id"])
            df = df.replace({"category_id": self.category_id_mapping})

        if self.metric_mapping:
            df["metric"] = pd.to_numeric(df["metric"])
            df = df.replace({"metric": self.metric_mapping})
        return df

    def parse_df_to_unified_format(self, df):
        """Turn rank lists into one row per app with one rank column per metric.

        The position of an app in ``app_rank_list`` (starting at 1) is its rank
        for the row's metric; per app the smallest rank of each metric is kept.

        NOTE(review): only the metric columns are aggregated, so
        ``category_id`` and ``country_code`` are re-created empty by the
        reindex -- confirm that this is intended.

        :param df: raw frame as returned by ``get_raw_data_by_date``/``get``.
        :return: frame with an ``app_id`` column and the metric columns.
        """
        metric_columns = ["free_download", "paid_download", "revenue",
                          "new_free_download", "new_paid_download"]
        columns = ["code", "category_id", "country_code"] + metric_columns
        data_list = []
        for _, row in df.iterrows():
            for rank, code in enumerate(row.app_rank_list, start=1):
                # Only the column of the row's own metric gets the rank.
                ranks = [rank if row.metric == name else None for name in metric_columns]
                data_list.append([code, row.category_id, row.country_code] + ranks)
        new_df = pd.DataFrame(data_list, columns=columns)
        aggregate_dict = {metric: 'min' for metric in APP_STORE_RANK_METRICS}
        new_df = new_df.groupby('code', as_index=False).aggregate(aggregate_dict).\
            reindex(columns=new_df.columns)
        # code to app_ids
        code_app_id_mapping = self.get_code_app_id_mapping(new_df.code.tolist())
        if code_app_id_mapping is not None:
            new_df = new_df.merge(code_app_id_mapping, on='code', how="left").rename(columns={'id': 'app_id'})
        else:
            # rename() returns a new frame; the result must be kept, otherwise
            # the frame never gets its ``app_id`` column.
            new_df = new_df.rename(columns={'code': 'app_id'})
        return new_df

    def get_unified_format(self, date):
        """Load the raw data of ``date`` and return it in the unified format."""
        df = self.get_raw_data_by_date(date)
        return self.parse_df_to_unified_format(df)

    def get_raw_data_by_date(self, date):
        """
        Load and parse every accepted raw file of ``date``.

        :return: raw data frame with the columns
                 date, country_code, category_id, metric, app_rank_list
        :rtype: pandas.DataFrame
        raw_data:
        _________________________________________________________________________________
        |    date    |   country_id   |  category_id  |   feed_id   |   rank (app_id)   |
        |------------|----------------|---------------|-------------|-------------------|
        | 2019-04-27 | 143441(bigint) |   6016 (int)  |   0 (int)   | 376510438(bigint) |
        ---------------------------------------------------------------------------------
        unified_data:
        _____________________________________________________________________________________
        |  country_code  |   category_id   |       app_id       | feed_name (free_download) |
        |----------------|-----------------|--------------------|---------------------------|
        |      'US'      | 100026 (bigint) | 376510438 (bigint) |    25 (int) (app_rank)    |
        -------------------------------------------------------------------------------------
        """
        path = "{_bucket_path}/{_date}/23/".format(_date=date, _bucket_path=self.bucket_path)
        bucket = rank_bucket(self.bucket_name)
        columns = ['date', 'country_code', 'category_id', 'metric', 'app_rank_list']

        _raw_data_list = []
        for filepath in bucket.list(path):
            filename = filepath.replace(path, '')
            # An empty filename_available means "accept every file".
            if self.filename_available and filename not in self.filename_available:
                continue
            _raw_data = zlib.decompress(bucket.get(filepath))
            for _line in _raw_data.splitlines():
                line_data = _line.split(self.data_split_str)
                # The 5th field is the rank list; drop blank entries.
                app_rank_list = [rank_app for rank_app in line_data[4].split(self.rank_list_split_str)
                                 if rank_app.strip() != '']
                if self.rank_split_str:
                    # Entries are split again and the second part is kept.
                    app_rank_list = [app_rank.split(self.rank_split_str)[1] for app_rank in app_rank_list]
                line_data[4] = app_rank_list
                _raw_data_list.append(line_data)
        return self.parse_mapping(pd.DataFrame(_raw_data_list, columns=columns))

    def get_code_app_id_mapping(self, code_ids):
        """Return a frame with ``code`` and ``id`` columns, or ``None`` when the
        codes need no mapping. The base implementation returns ``None``."""
        return None

    def get_rank_count_and_sum_by_date(self, date):
        """Count the ranked apps of ``date`` and sum their ranks.

        Within each raw row an app counts once, with its best (smallest) rank.
        When a code mapping exists only codes that resolve to an ``id`` are
        counted; without a mapping every code is counted.

        :return: ``(count, rank_sum)``; ``(0, 0)`` when there is nothing ranked.
        """
        raw_df = self.get_raw_data_by_date(date)
        records = []
        for _, row in raw_df.iterrows():
            # Ranks ascend with the position, so the first time a code is seen
            # is also its smallest rank within this row.
            best_rank = {}
            for rank, code in enumerate(row.app_rank_list, start=1):
                best_rank.setdefault(code, rank)
            records.extend(best_rank.items())

        if not records:
            # Nothing to aggregate; pd.concat/merge on nothing would raise.
            return 0, 0

        ranks_df = pd.DataFrame(records, columns=["code", "rank"])
        app_id_mapping_df = self.get_code_app_id_mapping(list(set(ranks_df.code.tolist())))
        if app_id_mapping_df is not None:
            ranks_df = ranks_df.merge(app_id_mapping_df, on='code', how="left")
            # The ``id`` column only exists after the merge, so the filter
            # belongs inside this branch.
            ranks_df = ranks_df[ranks_df["id"].notnull()]
        raw_count = len(ranks_df)
        raw_sum = ranks_df["rank"].sum()
        return raw_count, raw_sum


class MacRaw(AppStoreRankRawData):
    """Raw rank reader configured for the Mac store files."""

    bucket_name = "prod_appannie_ios"
    bucket_path = "mac/country-ranks"
    data_split_str = "\t"
    rank_list_split_str = " "

    device_code = "mac-os-mac"
    market_code = 'apple-store'

    category_id_mapping = CATEGORY_ID_MAPPING['mac-os-mac']
    metric_mapping = METRIC_MAPPING['mac-os-mac']
    country_code_mapping = COUNTRY_CODE_MAPPING['ios']

    # Only files named after a known iOS country id are loaded. The loop
    # variable is not called ``id``: that would shadow the builtin and, on
    # Python 2, leak into the class namespace as ``MacRaw.id``.
    filename_available = [str(country_id) for country_id in COUNTRY_CODE_MAPPING['ios']]


# class IPhoneRaw(AppStoreRankRawData):
#     bucket_name = "prod_appannie_ios"
#     bucket_path = "country-ranks"
#     data_split_str = "\t"
#     rank_list_split_str = " "
#     accept_feeds = [0, 1, 2]
#
#
# class IPadRaw(AppStoreRankRawData):
#     bucket_name = "prod_appannie_ios"
#     bucket_path = "country-ranks"
#     data_split_str = "\t"
#     rank_list_split_str = " "

# Load the raw frame in this cell so the print below does not depend on a
# ``df_temp`` left over from an earlier kernel session (NameError otherwise).
df_temp = MacRaw(spark).get_raw_data_by_date("2019-11-09")
print(df_temp)
MacRaw(spark).get('2019-11-09', 'US', 200000)


# self.parse_mapping(


In [0]:
%%sh

# Disabled alternative: list the raw Mac country-ranks objects for 2019-06-07.
# aws s3 ls s3://prod_appannie_ios/mac/country-ranks/2019-06-07/23/
# List the objects under the unified app-rank fact prefix for
# date=2019-06-05, device_code=mac-os-mac.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-free/unified/app-tech.store.app-rank.v1/fact/date=2019-06-05/device_code=mac-os-mac/

In [0]:

# Load the unified Mac app-rank partition of 2019-06-05 into pandas and show it.
df = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-free/unified/app-tech.store.app-rank.v1/fact/date=2019-06-05/device_code=mac-os-mac/").toPandas()
# Single-argument print(...) behaves the same on Python 2 and 3.
print(df)
 

In [0]:
%%sh

# List the deployed app_rank_v1 check files with their details.
ls -al /tmp/zeppelin_application_code/applications/db_check_v1/cases/store/app_rank_v1/

# Print the source of the deployed reader.py.
cat  /tmp/zeppelin_application_code/applications/db_check_v1/cases/store/app_rank_v1/reader.py

In [0]:
%%sh 


# Show the details of the package-install log file, then print its contents.
ls -al /var/log/pkg_install.log

cat /var/log/pkg_install.log

In [0]:
%%sh
