In [0]:


# Copyright (c) 2018 App Annie Inc. All rights reserved.

"""
DB Check modules
"""
import unittest

import datetime
import croniter
from sqlalchemy.dialects.postgresql import psycopg2
from aaintdatapipeline.application.app_qa.conf.settings import PRODUCT_SERVICE_ENDPOINT, \
    MICRO_SERVICE_ACCESS_ID, MICRO_SERVICE_SECRET_KEY, PG_AA_NAME, PG_AA_ACCESS_ID, PG_AA_HOSTS, PG_AA_SECRET_KEY, \
    PG_AA_AMAZON_NAME, PG_AA_AMAZON_ACCESS_ID, PG_AA_AMAZON_HOSTS, PG_AA_AMAZON_SECRET_KEY
from aaintdatapipeline.core.utils import microservice
from pyspark.sql import functions as F

from copy import deepcopy
from aaintdatapipeline.application.app_qa.common.db_check_utils import query
from aaintdatapipeline.application.app_qa.conf.settings import CITUS_MKT_NAME
from aaintdatapipeline.application.app_qa.conf.settings import CITUS_AA_CITUS_DB_NAME, \
    CITUS_AA_CITUS_DB_ACCESS_ID, CITUS_AA_CITUS_DB_HOSTS, CITUS_AA_CITUS_DB_SECRET_KEY
# from aaintdatapipeline.application.app_qa.db_check_v1.pyspark_test import PySparkTest
from aaintdatapipeline.core.conf import Conf
from aaintdatapipeline.core.fs.device import S3Bucket, specified_bucket
import zlib
import pandas as pd
import pandas.io.sql as sqlio
import psycopg2

# base_test.py
import os
import shutil
import unittest

from aaintdatapipeline.core.conf.settings import ROOT
from aaintdatapipeline.core.fs.device import meta_bucket, raw_bucket, unified_bucket
from aaintdatapipeline.core.fs.device.bucket import unified_data_system_config_bucket
from aaintdatapipeline.core.utils.commandline import env
from aaintdatapipeline.core.utils.encode import activate_system_utf8
from aaintdatapipeline.core.utils.spark import create_spark, eject_all_caches


class PipelineTest(unittest.TestCase):
    """
    Base test case for checks that follow a scheduled pipeline run.

    Subclasses set ``routing_config`` to a ``(<cron_time>, <days_delta>)`` tuple.
    On construction the case derives from ``trigger_datetime``:

    * ``prev_etl_datetime`` - the previous firing of ``<cron_time>``;
    * ``check_date_str`` - that firing minus ``<days_delta>`` days, as "%Y-%m-%d".
    """
    routing_config = None
    trigger_datetime = None
    prev_etl_datetime = None

    def __init__(self, methodName='runTest', trigger_datetime=None):
        """
        :param methodName: name of the test method to run (forwarded to unittest.TestCase)
        :type methodName: str
        :param trigger_datetime: reference date&time; defaults to the current UTC time
        :type trigger_datetime: datetime.datetime
        """
        super(PipelineTest, self).__init__(methodName)
        self.trigger_datetime = trigger_datetime or datetime.datetime.utcnow()
        self.check_date_str = self._get_check_date_str_from_routing_config(self.trigger_datetime)
        self.prev_etl_datetime = self._get_pre_etl_completed_date()

    def setUp(self):
        super(PipelineTest, self).setUp()
        self._verify_config()

    @classmethod
    def setUpClass(cls):
        """Switch the process to UTF-8 and create the Spark session shared by the class."""
        super(PipelineTest, cls).setUpClass()
        activate_system_utf8()
        env(PYTHONIOENCODING='utf8')
        cls.spark = create_spark()
        cls.sc = cls.spark.sparkContext

    def _verify_config(self):
        """Fail the test early when any of the derived scheduling attributes is missing."""
        self.assertIsNotNone(self.routing_config)
        self.assertIsNotNone(self.trigger_datetime)
        self.assertIsNotNone(self.prev_etl_datetime)
        self.assertIsNotNone(self.check_date_str)

    def _previous_scheduled_datetime(self, trigger_datetime):
        """Return the last firing of the routing_config cron schedule before trigger_datetime."""
        schedule, _ = self.routing_config
        return croniter.croniter(schedule, trigger_datetime).get_prev(datetime.datetime)

    def _get_check_date_str_from_routing_config(self, trigger_datetime):
        """
        return the date of : <days_delta> ago from previous scheduled date&time of <cron_time>.
        e.g.
        routing_config = ("0 9 * * *", 1), trigger_datetime is 2019-10-27 8:00
        so previous scheduled date&time is 2019-10-26 9:00
        will return "2019-10-25"

        The schedule is read from ``self.routing_config``, format: (<cron_time>, <days_delta>).
        the cron_time please refer to https://support.acquia.com/hc/en-us/articles/360004224494-Cron-time-string-format
        the days_delta is the days ago from the expected date.

        :param trigger_datetime: date&time the previous schedule is computed from
        :type trigger_datetime: datetime.datetime
        :return: date string of "%Y-%m-%d"
        :type return: str
        """
        _, days_delta = self.routing_config
        date = self._previous_scheduled_datetime(trigger_datetime) - datetime.timedelta(days=days_delta)
        return date.strftime("%Y-%m-%d")

    def _get_pre_etl_completed_date(self):
        """
        :return: previous scheduled date&time of <cron_time> relative to self.trigger_datetime
        :type return: datetime.datetime
        """
        return self._previous_scheduled_datetime(self.trigger_datetime)


def rank_bucket(bucket_str):
    """
    Build the bucket object that corresponds to a bucket string.

    :param bucket_str: bucket identifier, string like: s3://xxx.xxx
    :type bucket_str: str
    :return: the object returned by ``specified_bucket`` for an ``S3Bucket`` configuration
    :rtype: bucket_object
    """
    bucket_conf = Conf(bucket_class=S3Bucket, bucket_name=bucket_str)
    return specified_bucket(bucket_conf)


class AppStoreRankRawData():
    """
    Reader for raw app-store rank files kept in a bucket.

    Subclasses configure where the files live and how one line is split, and
    override ``get_code_app_id_mapping`` to translate store codes into app ids.
    """
    bucket_name = ""            # handed to rank_bucket()
    bucket_path = ""            # prefix under which "<date>/23/" is listed
    data_split_str = ""         # separator between the fields of one raw line
    rank_list_split_str = ""    # separator between the entries of the rank-list field
    rank_split_str = ""         # optional separator inside one entry; the part after it is kept
    accept_feeds = []
    country_code_mapping = {}   # raw country value -> country code; falsy disables the mapping
    category_id_mapping = {}    # raw category id -> unified category id; falsy disables the mapping
    metric_mapping = {}         # raw feed id -> metric name; falsy disables the mapping
    market_code = ""
    filename_available = []     # when non-empty, only files with one of these names are read

    def get(self, date, country_code, category_id):
        """
        Return the raw rows of ``date`` for one country and one category.

        :param date: date folder to read, e.g. "2019-04-27"
        :param country_code: value compared against the (mapped) ``country_code`` column
        :param category_id: value compared against the (mapped) ``category_id`` column
        :return: the matching rows of ``get_raw_data_by_date(date)``
        :rtype: pandas.DataFrame
        """
        df = self.get_raw_data_by_date(date)
        return df.loc[(df.country_code == country_code) & (df.category_id == category_id)]

    def parse_mapping(self, df):
        """
        Apply the configured country / category / metric mappings to ``df``.

        ``category_id`` and ``metric`` are converted to numbers before the lookup
        because the mapping keys are integers. The input frame is left untouched.

        :param df: frame with ``country_code``, ``category_id`` and ``metric`` columns
        :type df: pandas.DataFrame
        :return: a new frame with the mapped values
        :rtype: pandas.DataFrame
        """
        if self.country_code_mapping:
            df = df.replace({"country_code": self.country_code_mapping})

        if self.category_id_mapping:
            df = df.assign(category_id=pd.to_numeric(df["category_id"]))
            df = df.replace({"category_id": self.category_id_mapping})

        if self.metric_mapping:
            df = df.assign(metric=pd.to_numeric(df["metric"]))
            df = df.replace({"metric": self.metric_mapping})
        return df

    def parse_df_to_unified_format(self, df):
        """
        Turn raw rows (one rank list per row) into one row per code, with one
        rank column per metric.

        :param df: frame as returned by ``get_raw_data_by_date`` / ``get``
        :type df: pandas.DataFrame
        :return: frame with ``code``, ``category_id``, ``country_code``, the metric
                 columns and - when a code->id mapping is available - ``app_id``
        :rtype: pandas.DataFrame
        """
        metric_columns = ["free_download", "paid_download", "revenue", "new_free_download", "new_paid_download"]
        columns = ["code", "category_id", "country_code"] + metric_columns
        data_list = []
        for _, row in df.iterrows():
            # the 1-based position in the list is the rank; empty lists add nothing
            for rank, code in enumerate(row.app_rank_list, start=1):
                ranks = [rank if row.metric == metric else None for metric in metric_columns]
                data_list.append([code, row.category_id, row.country_code] + ranks)
        new_df = pd.DataFrame(data_list, columns=columns)
        aggregate_dict = {metric: 'min' for metric in APP_STORE_RANK_METRICS}
        # NOTE(review): only the metric columns are aggregated, so reindex() brings
        # category_id / country_code back as empty columns - confirm this is intended.
        new_df = new_df.groupby('code', as_index=False).aggregate(aggregate_dict).\
            reindex(columns=new_df.columns)
        # code to app_ids
        code_app_id_mapping = self.get_code_app_id_mapping(new_df.code.tolist())
        if code_app_id_mapping is not None:
            new_df = new_df.merge(code_app_id_mapping, on='code', how="left").rename(columns={'id': 'app_id'})
        return new_df

    def get_unified_format(self, date):
        """Read the raw data of ``date`` and return it in the unified format."""
        df = self.get_raw_data_by_date(date)
        return self.parse_df_to_unified_format(df)

    def get_raw_data_by_date(self, date):
        """
        Read every accepted file under ``<bucket_path>/<date>/23/`` and parse it.

        :param date: date folder to read, e.g. "2019-04-27"
        :return: raw_data_frame with columns
                 date, country_code, category_id, metric, app_rank_list
        :rtype: pandas.DataFrame
        raw_data:
        _________________________________________________________________________________
        |    date    |   country_id   |  category_id  |   feed_id   |   rank (app_id)   |
        |------------|----------------|---------------|-------------|-------------------|
        | 2019-04-27 | 143441(bigint) |   6016 (int)  |   0 (int)   | 376510438(bigint) |
        ---------------------------------------------------------------------------------
        unified_data:
        _____________________________________________________________________________________
        |  country_code  |   category_id   |       app_id       | feed_name (free_download) |
        |----------------|-----------------|--------------------|---------------------------|
        |      'US'      | 100026 (bigint) | 376510438 (bigint) |    25 (int) (app_rank)    |
        -------------------------------------------------------------------------------------
        """
        path = "{_bucket_path}/{_date}/23/".format(_date=date, _bucket_path=self.bucket_path)
        bucket = rank_bucket(self.bucket_name)
        columns = ['date', 'country_code', 'category_id', 'metric', 'app_rank_list']

        _raw_data_list = []
        for filepath in bucket.list(path):
            filename = filepath.replace(path, '')
            if self.filename_available and filename not in self.filename_available:
                continue
            _raw_data = zlib.decompress(bucket.get(filepath))
            for _line in _raw_data.splitlines():
                line_data = _line.split(self.data_split_str)
                # the fifth field holds the ranked entries; blank entries are dropped
                app_rank_list = [rank_app for rank_app in line_data[4].split(self.rank_list_split_str) if
                                 rank_app.strip() != '']
                if self.rank_split_str:
                    # keep only the part after the separator of each entry
                    app_rank_list = [app_rank.split(self.rank_split_str)[1] for app_rank in app_rank_list]
                line_data[4] = app_rank_list
                _raw_data_list.append(line_data)
        return self.parse_mapping(pd.DataFrame(_raw_data_list, columns=columns))

    def get_code_app_id_mapping(self, code_ids):
        """
        Return a frame with ``id`` and ``code`` columns for ``code_ids``.

        The base class knows no mapping and returns None; subclasses override it.
        """
        return None

    def get_rank_count_and_sum_by_date(self, date):
        """
        Count the ranked codes of ``date`` and sum their ranks.

        Inside one rank list a code is counted once, at its best (lowest) rank.
        When a code->id mapping is available only codes with a known id are kept;
        without a mapping every code is kept.

        :param date: date folder to read
        :return: (count, rank_sum); (0, 0) when no rank list has any entry
        :rtype: tuple
        """
        raw_df = self.get_raw_data_by_date(date)
        records = []
        for _, row in raw_df.iterrows():
            best_rank = {}
            for rank, code in enumerate(row.app_rank_list, start=1):
                # ranks grow with the position, so the first hit is the minimum
                best_rank.setdefault(code, rank)
            records.extend(best_rank.items())

        if not records:
            return 0, 0

        raw_agg_df_all = pd.DataFrame(records, columns=["code", "rank"])
        app_id_mapping_df = self.get_code_app_id_mapping(list(set(raw_agg_df_all.code.tolist())))
        if app_id_mapping_df is not None:
            raw_agg_df_all = raw_agg_df_all.merge(app_id_mapping_df, on='code', how="left")
            raw_agg_df_all = raw_agg_df_all[raw_agg_df_all.id.notnull()]
        raw_count = len(raw_agg_df_all)
        raw_sum = raw_agg_df_all["rank"].sum()
        return raw_count, raw_sum

# class IPhoneRaw(AppStoreRankRawData):
#     bucket_name = "prod_appannie_ios"
#     bucket_path = "country-ranks"
#     data_split_str = "\t"
#     rank_list_split_str = " "
#     accept_feeds = [0, 1, 2]
#
#
# class IPadRaw(AppStoreRankRawData):
#     bucket_name = "prod_appannie_ios"
#     bucket_path = "country-ranks"
#     data_split_str = "\t"
#     rank_list_split_str = " "


# class MacRaw(AppStoreRankRawData):
#     bucket_name = "prod_appannie_ios"
#     bucket_path = "mac/country-ranks"
#     data_split_str = "\t"
#     rank_list_split_str = " "

# Raw feed id -> metric column name.
# 'tv-os-tv' is keyed directly by device code (read by AppleTvRaw.metric_mapping);
# 'amazon-store' is keyed by market code and then device code (read by
# AmazonRaw.metric_mapping). AppStoreRankRawData.parse_mapping converts the raw
# metric field to a number and replaces it with the name found here.
METRIC_MAPPING = {
    'tv-os-tv': {
        0: 'free_download',
        1: 'paid_download',
        2: 'revenue'
    },
    'amazon-store':{
        'android-all':{
            0: 'free_download',
            1: 'paid_download',
            3: 'new_free_download'
        }
    }

}
# Raw country identifier -> two-letter country code.
# 'ios' maps numeric ids to codes (0 maps to 'WW'); AppleTvRaw only uses its
# values, as the list of accepted file names.
# 'amazon-store' / 'android-all' is read by AmazonRaw.country_code_mapping and
# rewrites the raw 'UK' value to 'GB'.
COUNTRY_CODE_MAPPING = {
    'ios': {143441: 'US', 143442: 'FR', 143443: 'DE', 143444: 'GB', 143445: 'AT',
            143446: 'BE', 143447: 'FI', 143448: 'GR', 143449: 'IE', 143450: 'IT',
            143451: 'LU', 143452: 'NL', 143453: 'PT', 143454: 'ES', 143455: 'CA',
            143456: 'SE', 143457: 'NO', 143458: 'DK', 143459: 'CH', 143460: 'AU',
            143461: 'NZ', 143462: 'JP', 143463: 'HK', 143464: 'SG', 143465: 'CN',
            143466: 'KR', 143467: 'IN', 143468: 'MX', 143469: 'RU', 143470: 'TW',
            143471: 'VN', 143472: 'ZA', 143473: 'MY', 143474: 'PH', 143475: 'TH',
            143476: 'ID', 143477: 'PK', 143478: 'PL', 143479: 'SA', 143480: 'TR',
            143481: 'AE', 143482: 'HU', 143483: 'CL', 143484: 'NP', 143485: 'PA',
            143486: 'LK', 143487: 'RO', 143489: 'CZ', 143491: 'IL', 143492: 'UA',
            143493: 'KW', 143494: 'HR', 143495: 'CR', 143496: 'SK', 143497: 'LB',
            143498: 'QA', 143499: 'SI', 143501: 'CO', 143502: 'VE', 143503: 'BR',
            143504: 'GT', 143505: 'AR', 143506: 'SV', 143507: 'PE', 143508: 'DO',
            143509: 'EC', 143510: 'HN', 143511: 'JM', 143512: 'NI', 143513: 'PY',
            143514: 'UY', 143515: 'MO', 143516: 'EG', 143517: 'KZ', 143518: 'EE',
            143519: 'LV', 143520: 'LT', 143521: 'MT', 143523: 'MD', 143524: 'AM',
            143525: 'BW', 143526: 'BG', 143528: 'JO', 143529: 'KE', 143530: 'MK',
            143531: 'MG', 143532: 'ML', 143533: 'MU', 143534: 'NE', 143535: 'SN',
            143536: 'TN', 143537: 'UG', 143538: 'AI', 143539: 'BS', 143540: 'AG',
            143541: 'BB', 143542: 'BM', 143543: 'VG', 143544: 'KY', 143545: 'DM',
            143546: 'GD', 143547: 'MS', 143548: 'KN', 143549: 'LC', 143550: 'VC',
            143551: 'TT', 143552: 'TC', 143553: 'GY', 143554: 'SR', 143555: 'BZ',
            143556: 'BO', 143557: 'CY', 143558: 'IS', 143559: 'BH', 143560: 'BN',
            143561: 'NG', 143562: 'OM', 143563: 'DZ', 143564: 'AO', 143565: 'BY',
            143566: 'UZ', 143568: 'AZ', 143571: 'YE', 143572: 'TZ', 143573: 'GH',
            143575: 'AL', 143576: 'BJ', 143577: 'BT', 143578: 'BF', 143579: 'KH',
            143580: 'CV', 143581: 'TD', 143582: 'CG', 143583: 'FJ', 143584: 'GM',
            143585: 'GW', 143586: 'KG', 143587: 'LA', 143588: 'LR', 143589: 'MW',
            143590: 'MR', 143591: 'FM', 143592: 'MN', 143593: 'MZ', 143594: 'NA',
            143595: 'PW', 143597: 'PG', 143598: 'ST', 143599: 'SC', 143600: 'SL',
            143601: 'SB', 143602: 'SZ', 143603: 'TJ', 143604: 'TM', 143605: 'ZW',
            0: 'WW'},
    'amazon-store':{
        'android-all':{
            'UK' : 'GB',
        }
    }
}
# Raw store category id -> unified category id.
# "tv-os-tv" is keyed directly by device code (read by AppleTvRaw.category_id_mapping);
# 'amazon-store' is keyed by market code and then device code (read by
# AmazonRaw.category_id_mapping). AppStoreRankRawData.parse_mapping converts the raw
# category field to a number and replaces it with the value found here.
# NOTE(review): the amazon table looks like several id sets concatenated (each run
# starts with a small key mapped to 720000) - confirm which storefront each run covers.
CATEGORY_ID_MAPPING = {
    "tv-os-tv": {
        36: 300000,
        360: 300001,
        6004: 300002,
        6009: 300003,
        6012: 300004,
        6013: 300005,
        6014: 300006,
        6016: 300007,
        6017: 300008
    },
    'amazon-store':{
        'android-all': {
        10: 720000,
        6395703011: 720001,
        6395710011: 720003,
        6395709011: 720004,
        6395711011: 720005,
        6395712011: 720006,
        6395713011: 720007,
        6395702011: 720008,
        6395714011: 720009,
        6395715011: 720010,
        6395716011: 720011,
        6395717011: 720012,
        6395720011: 720015,
        6395726011: 720016,
        6395718011: 720017,
        6395727011: 720019,
        6395729011: 720020,
        6395745011: 720021,
        6395728011: 720022,
        6395730011: 720024,
        6395731011: 720025,
        6395732011: 720026,
        6395733011: 720028,
        6395734011: 720029,
        6395735011: 720030,
        6395736011: 720031,
        6395737011: 720032,
        6395738011: 720033,
        6395747011: 720034,
        6395744011: 720035,
        8: 720000,
        148152071: 720001,
        148153071: 720003,
        148154071: 720004,
        148155071: 720005,
        148156071: 720006,
        148157071: 720007,
        148158071: 720008,
        148159071: 720009,
        148160071: 720010,
        148161071: 720011,
        148162071: 720012,
        148165071: 720013,
        148163071: 720015,
        148164071: 720016,
        148166071: 720018,
        148167071: 720019,
        148168071: 720020,
        148169071: 720021,
        152899071: 720022,
        148170071: 720024,
        148171071: 720025,
        148172071: 720026,
        148173071: 720028,
        148174071: 720029,
        148175071: 720030,
        148176071: 720031,
        148177071: 720032,
        148178071: 720033,
        148180071: 720034,
        148181071: 720035,
        2: 720000,
        1720677031: 720001,
        1720683031: 720002,
        1720684031: 720003,
        1720685031: 720004,
        1720686031: 720005,
        1720687031: 720006,
        1720688031: 720007,
        1720689031: 720009,
        1720690031: 720010,
        1720691031: 720011,
        1720692031: 720012,
        1720693031: 720013,
        1720712031: 720014,
        1720694031: 720015,
        1720700031: 720016,
        1720701031: 720018,
        1720702031: 720019,
        1720703031: 720020,
        1720704031: 720021,
        1720705031: 720022,
        1720706031: 720023,
        1720707031: 720025,
        1720708031: 720026,
        1720714031: 720027,
        1720709031: 720028,
        1720710031: 720029,
        1720711031: 720030,
        1720713031: 720032,
        1720724031: 720034,
        1720725031: 720035,
        3: 720000,
        1726743031: 720001,
        1726749031: 720002,
        1726750031: 720003,
        1726751031: 720004,
        1726752031: 720005,
        1726753031: 720006,
        1726754031: 720007,
        1726755031: 720009,
        1726756031: 720010,
        1726757031: 720011,
        1726758031: 720012,
        1726759031: 720013,
        1726778031: 720014,
        1726760031: 720015,
        1726766031: 720016,
        1726767031: 720018,
        1726768031: 720019,
        1726769031: 720020,
        1726770031: 720021,
        1726771031: 720022,
        1726772031: 720023,
        1726773031: 720025,
        1726774031: 720026,
        1726775031: 720028,
        1726776031: 720029,
        1726777031: 720030,
        1726779031: 720032,
        1726780031: 720033,
        1726790031: 720034,
        1726791031: 720035,
        4: 720000,
        1722761031: 720001,
        1722767031: 720002,
        1722768031: 720003,
        1722769031: 720004,
        1722770031: 720005,
        1722771031: 720006,
        1722772031: 720007,
        1722773031: 720009,
        1722774031: 720010,
        1722775031: 720011,
        1722776031: 720012,
        1722777031: 720013,
        1722796031: 720014,
        1722778031: 720015,
        1722784031: 720016,
        1722785031: 720018,
        1722786031: 720019,
        1722787031: 720020,
        1722788031: 720021,
        1722789031: 720022,
        1722790031: 720023,
        1722791031: 720025,
        1722792031: 720026,
        1722793031: 720028,
        1722794031: 720029,
        1722795031: 720030,
        1722797031: 720032,
        1722798031: 720033,
        1722808031: 720034,
        1722809031: 720035,
        5: 720000,
        1725417031: 720001,
        1725423031: 720002,
        1725424031: 720003,
        1725425031: 720004,
        1725426031: 720005,
        1725427031: 720006,
        1725428031: 720007,
        1725429031: 720009,
        1725430031: 720010,
        1725431031: 720011,
        1725432031: 720012,
        1725433031: 720013,
        1725452031: 720014,
        1725434031: 720015,
        1725440031: 720016,
        1725441031: 720018,
        1725442031: 720019,
        1725443031: 720020,
        1725444031: 720021,
        1725445031: 720022,
        1725446031: 720023,
        1725447031: 720025,
        1725448031: 720026,
        1725449031: 720028,
        1725450031: 720029,
        1725451031: 720030,
        1725453031: 720032,
        1725454031: 720033,
        1725464031: 720034,
        1725465031: 720035,
        6: 720000,
        2386858051: 720001,
        2386869051: 720002,
        2386864051: 720003,
        2386865051: 720004,
        2386866051: 720005,
        2386867051: 720006,
        2386868051: 720007,
        2386870051: 720009,
        2386871051: 720010,
        2386872051: 720011,
        2386873051: 720012,
        2386874051: 720013,
        2386892051: 720014,
        2386875051: 720015,
        2386881051: 720016,
        2386882051: 720018,
        2386883051: 720019,
        2386884051: 720020,
        2386885051: 720021,
        2386894051: 720022,
        2386886051: 720023,
        2386887051: 720025,
        2386888051: 720026,
        2386889051: 720028,
        2386890051: 720029,
        2386891051: 720030,
        2386893051: 720032,
        2386895051: 720033,
        2386905051: 720034,
        2386906051: 720035,
        7: 720000,
        1710348031: 720001,
        1710359031: 720002,
        1710354031: 720003,
        1710355031: 720004,
        1710356031: 720005,
        1710357031: 720006,
        1710358031: 720007,
        1710360031: 720009,
        1710361031: 720010,
        1710362031: 720011,
        1710363031: 720012,
        1710364031: 720013,
        1710383031: 720014,
        1710365031: 720015,
        1710371031: 720016,
        1710372031: 720018,
        1710373031: 720019,
        1710374031: 720020,
        1710375031: 720021,
        1710376031: 720022,
        1710377031: 720023,
        1710378031: 720025,
        1710379031: 720026,
        1710380031: 720028,
        1710381031: 720029,
        1710382031: 720030,
        1710384031: 720032,
        1710385031: 720033,
        1710395031: 720034,
        1710396031: 720035,
        1: 720000,
        2478833011: 720001,
        2478832011: 720002,
        2478840011: 720003,
        2478839011: 720004,
        2478841011: 720005,
        2478842011: 720006,
        2478843011: 720007,
        2478844011: 720009,
        2478845011: 720010,
        2478846011: 720011,
        2478847011: 720012,
        2478867011: 720014,
        2478849011: 720015,
        2478855011: 720016,
        3310778011: 720017,
        2478858011: 720019,
        2478860011: 720020,
        2577638011: 720021,
        2478859011: 720022,
        2478861011: 720024,
        2478862011: 720025,
        2478863011: 720026,
        2478864011: 720028,
        2478865011: 720029,
        2478866011: 720030,
        2478868011: 720032,
        2478869011: 720033,
        3310683011: 720034,
        2478875011: 720035
        }
    }
}


class AppleTvRaw(AppStoreRankRawData):
    """
    Raw rank reader for the Apple TV files ('tv-os-tv' device, 'apple-store' market).

    Lines are comma separated, the rank list is space separated and each entry is
    split on "-" with the second part kept as the code.
    """
    bucket_name = "prod_appannie_appletv"
    bucket_path = "country-ranks"
    data_split_str = ","
    rank_list_split_str = " "
    rank_split_str = "-"
    country_code_mapping = None     # country values are used as they appear in the files
    category_id_mapping = CATEGORY_ID_MAPPING['tv-os-tv']
    metric_mapping = METRIC_MAPPING['tv-os-tv']

    device_code = "tv-os-tv"
    market_code = 'apple-store'
    # Only files named after a known country code are read. values() avoids a loop
    # variable that would shadow the builtin `id` (and, on Python 2, leak into the
    # class namespace).
    filename_available = list(COUNTRY_CODE_MAPPING['ios'].values())

    # this should be a product_service
    def get_code_app_id_mapping(self, code_list):
        """
        Look up the ids of the given codes in the ``product`` table.

        :param code_list: codes to look up
        :type code_list: list
        :return: frame with ``id`` and ``code`` columns, one row per code found
        :rtype: pandas.DataFrame
        """
        # The codes come from raw files, so they are bound as a parameter instead of
        # being pasted into the statement (psycopg2 adapts a list to an ARRAY).
        sql = "select id,code from product where code = ANY(%(codes)s)"
        conn = psycopg2.connect(aa_dsn)
        try:
            return sqlio.read_sql_query(sql, conn, params={"codes": list(code_list)})
        finally:
            # the connection is only needed for this one query
            conn.close()


class AmazonRaw(AppStoreRankRawData):
    """
    Raw rank reader for the Amazon files ('android-all' device, 'amazon-store' market).

    Lines are comma separated, the rank list is space separated and each entry is
    split on "-" with the second part kept as the code.
    """
    bucket_name = "prod_appannie_amazon"
    bucket_path = "country-ranks"
    data_split_str = ","
    rank_list_split_str = " "
    rank_split_str = "-"
    device_code = "android-all"
    market_code = 'amazon-store'

    # only files with one of these names are read
    filename_available = ['FR', 'CN', 'CA', 'DE', 'JP', 'IT', 'US', 'UK', 'ES']

    country_code_mapping = COUNTRY_CODE_MAPPING[market_code][device_code]
    category_id_mapping = CATEGORY_ID_MAPPING[market_code][device_code]
    metric_mapping = METRIC_MAPPING[market_code][device_code]

    # this should be a product_service
    def get_code_app_id_mapping(self, code_list, dsn=None):
        """
        Look up the ids of the given codes in the ``app`` table.

        :param code_list: codes to look up
        :type code_list: list
        :param dsn: connection string to use; defaults to ``aa_amazon_dsn``. Optional so
                    that callers passing the dsn explicitly keep working.
        :type dsn: str
        :return: frame with ``id`` and ``code`` columns, one row per code found
        :rtype: pandas.DataFrame
        """
        # The codes come from raw files, so they are bound as a parameter instead of
        # being pasted into the statement (psycopg2 adapts a list to an ARRAY).
        sql = "select id,code from app where code = ANY(%(codes)s)"
        conn = psycopg2.connect(dsn or aa_amazon_dsn)
        try:
            return sqlio.read_sql_query(sql, conn, params={"codes": list(code_list)})
        finally:
            # the connection is only needed for this one query
            conn.close()

    # get_rank_count_and_sum_by_date is inherited from AppStoreRankRawData: the former
    # override here was a copy of the base implementation, and it passed a second
    # argument that get_code_app_id_mapping did not accept.


# class GooglePlayRaw(AppStoreRankRawData):
#     bucket_name = "prod_appannie_android"
#     bucket_path = "country-ranks"
#     data_split_str = ","
#     rank_list_split_str = " "
#     accept_feeds = [0, 1, 2]


# CONSTANTS
APP_STORE_RANK_METRICS = ["free_download", "new_paid_download", "revenue", "paid_download", "new_free_download"]


def _build_dsn(db, user, password, hosts):
    """
    Render a connection string from the database name, the credentials and the
    first (host, port) entry of ``hosts``.
    """
    first_host = hosts[0]
    template = (
        "dbname='{db}' user='{user}' password='{password}' "
        "host='{host}' port='{port}'"
    )
    return template.format(db=db, user=user, password=password, host=first_host[0], port=first_host[1])


# connection string of the citus database
citus_dsn = _build_dsn(
    CITUS_AA_CITUS_DB_NAME, CITUS_AA_CITUS_DB_ACCESS_ID, CITUS_AA_CITUS_DB_SECRET_KEY, CITUS_AA_CITUS_DB_HOSTS)

# connection string of the PG_AA database
aa_dsn = _build_dsn(PG_AA_NAME, PG_AA_ACCESS_ID, PG_AA_SECRET_KEY, PG_AA_HOSTS)

# connection string of the PG_AA_AMAZON database
aa_amazon_dsn = _build_dsn(
    PG_AA_AMAZON_NAME, PG_AA_AMAZON_ACCESS_ID, PG_AA_AMAZON_SECRET_KEY, PG_AA_AMAZON_HOSTS)




class AppleTvUnified():
    """
    Reader for the unified app-rank parquet data of one device code.

    NOTE(review): the methods use a module-level ``spark`` session that is not
    defined in this file - presumably supplied by the notebook; confirm.
    """
    device_code = 'tv-os-tv'
    s3_path = "s3://b2c-prod-data-pipeline-unified-store-free/unified/app-tech.store.app-rank.v1/fact/"

    def get(self, date, country_code, category_id):
        """Return the rows of ``date`` for one country and category as a pandas frame."""
        predicate = "country_code='{country_code}' and category_id='{category_id}'".format(
            country_code=country_code, category_id=category_id)
        return self.get_raw_df_by_date(date).filter(predicate).toPandas()

    def get_raw_df_by_date(self, date):
        """Load the parquet partition of ``date`` and this class's device code."""
        partition = "{}date={}/device_code={}/".format(self.s3_path, date, self.device_code)
        return spark.read.parquet(partition)

    def get_rank_count_and_sum_by_date(self, date):
        """
        Count the non-null ranks and add up the ranks of every metric column, over
        the rows of ``date`` whose app_id is set.

        :return: (unified_count, unified_sum)
        :rtype: tuple
        """
        metrics = ("free_download", "paid_download", "revenue", "new_free_download", "new_paid_download")
        aggregations = [F.count(metric).alias(metric + "_count") for metric in metrics]
        aggregations += [F.sum(metric).alias(metric + "_sum") for metric in metrics]

        known_apps = self.get_raw_df_by_date(date).filter('app_id is not null')
        data = known_apps.agg(*aggregations).collect()[0]

        unified_count = sum([data[metric + "_count"] for metric in metrics])
        # a metric without any value sums to None, which is counted as 0
        unified_sum = sum([data[metric + "_sum"] or 0 for metric in metrics])
        return unified_count, unified_sum


class AmazonUnified(AppleTvUnified):
    """Unified app-rank reader for the 'android-all' device code; reads the v2 fact path."""
    device_code = 'android-all'
    s3_path = "s3://b2c-prod-data-pipeline-unified-store-free/unified/app-tech.store.app-rank.v2/fact/"

class AppleTvDB():
    """Reader for the app-rank fact table, restricted to one device code."""
    schema = "store"
    table = "store_app_rank_fact_v1"
    device_code = 'tv-os-tv'

    def get(self, date, country_code, category_id):
        """
        Return the fact rows of ``date`` for this device code, one country and one category.

        :return: result of the query run against ``citus_dsn``
        :rtype: pandas.DataFrame
        """
        sql = "SELECT * from {schema}.{table} where date ='{date}' AND device_code='{device_code}' AND " \
              "country_code='{country_code}' AND category_id='{category_id}'".format(
            schema=self.schema, table=self.table, date=date, device_code=self.device_code,
            country_code=country_code, category_id=category_id)
        result = self.query_df(citus_dsn, sql)
        return result

    @staticmethod
    def query(dsn, sql):
        """
        Run ``sql`` through a cursor and return the rows as a frame.

        :param dsn: connection string
        :param sql: statement to execute
        :return: one column per result column, named after the cursor description
        :rtype: pandas.DataFrame
        """
        conn = psycopg2.connect(dsn)
        try:
            conn.autocommit = True
            with conn.cursor() as cur:
                cur.execute(sql)
                rows = cur.fetchall()
                # psycopg2 cursors expose the column names through description, not keys()
                column_names = [column[0] for column in cur.description]
        finally:
            # psycopg2's connection context manager does not close, so close explicitly
            conn.close()
        # passing the names up front also works for an empty result
        return pd.DataFrame(rows, columns=column_names)

    @staticmethod
    def query_df(dsn, sql):
        """
        Run ``sql`` with pandas and return the result frame.

        :param dsn: connection string
        :param sql: statement to execute
        :rtype: pandas.DataFrame
        """
        conn = psycopg2.connect(dsn)
        try:
            return sqlio.read_sql_query(sql, conn)
        finally:
            conn.close()

class AmazonDB(AppleTvDB):
    """Fact-table reader for the 'android-all' device code; everything else is inherited."""
    device_code = 'android-all'


# CASES

class AppStoreRankDailyTest(PipelineTest):
    """Daily checks that the raw, unified and DB views of the app-store ranks agree."""
    # NOTE(review): '* 9 * * *' fires every minute of hour 9; the PipelineTest
    # docstring example uses '0 9 * * *' - confirm which one is intended.
    routing_config = ('* 9 * * *', 1)

    def _assert_sources_agree(self, frames):
        """
        For every rank metric, assert that all frames list the same app ids and the
        same ranks once rows without that metric are dropped and the rest is sorted
        by rank.

        :param frames: source name -> pandas.DataFrame with ``app_id`` and metric columns
        :type frames: dict
        """
        for metric in APP_STORE_RANK_METRICS:
            data_app = {}
            data_rank = {}
            for source, frame in frames.items():
                ranked = frame[pd.notnull(frame[metric])].sort_values(metric)
                data_app[source] = ranked.app_id.tolist()
                data_rank[source] = ranked[metric].tolist()

            expected_apps = data_app['raw']
            expected_ranks = data_rank['raw']
            self.assertTrue(all(apps == expected_apps for apps in data_app.values()),
                            msg="{}".format(data_app))
            self.assertTrue(all(ranks == expected_ranks for ranks in data_rank.values()),
                            msg="{}".format(data_rank))

    def test_apple_tv_etl_process(self):
        country_code = 'US'
        category_id = 300000
        raw_reader = AppleTvRaw()
        raw_df = raw_reader.parse_df_to_unified_format(
            raw_reader.get(self.check_date_str, country_code, category_id))
        unified_df = AppleTvUnified().get(self.check_date_str, country_code, category_id)
        db_df = AppleTvDB().get(self.check_date_str, country_code, category_id)
        # every source goes through the same filter + sort before it is compared
        self._assert_sources_agree({'raw': raw_df, 'unified': unified_df, 'db': db_df})

    def test_amazon_etl_process(self):
        country_code = 'US'
        category_id = 720000
        raw_reader = AmazonRaw()
        raw_df = raw_reader.parse_df_to_unified_format(
            raw_reader.get(self.check_date_str, country_code, category_id))
        unified_df = AmazonUnified().get(self.check_date_str, country_code, category_id)
        # The DB comparison is disabled for amazon, so only raw and unified are
        # compared; an empty 'db' placeholder would fail whenever data exists.
        self._assert_sources_agree({'raw': raw_df, 'unified': unified_df})

    def test_amazon_completeness(self):
        raw_count, raw_sum = AmazonRaw().get_rank_count_and_sum_by_date(self.check_date_str)
        unified_count, unified_sum = AmazonUnified().get_rank_count_and_sum_by_date(self.check_date_str)

        self.assertEqual(raw_count, unified_count)
        self.assertEqual(raw_sum, unified_sum)

    def test_apple_tv_completeness(self):
        raw_count, raw_sum = AppleTvRaw().get_rank_count_and_sum_by_date(self.check_date_str)
        unified_count, unified_sum = AppleTvUnified().get_rank_count_and_sum_by_date(self.check_date_str)

        self.assertEqual(raw_count, unified_count)
        self.assertEqual(raw_sum, unified_sum)







In [0]:



import datetime

def logtime():
    """Print the current date and time, as given by ``datetime.datetime.now()``.

    ``print`` is written as a call with a single argument so the function
    runs unchanged on Python 2 and Python 3.
    """
    print(datetime.datetime.now())

def get_sum_count_raw(date_str):
    """Count and sum the Amazon raw ranks of one date that map to a known app.

    Every raw row with a non-empty ``app_rank_list`` is expanded to one
    (code, rank) pair per listed entry, the rank being the 1-based position
    in the list; a code listed more than once in a row keeps its smallest
    rank. All codes are then resolved with a single
    ``get_code_app_id_mapping`` call and only the pairs whose mapped ``id``
    is not null are counted.

    Relies on the module-level ``aa_amazon_dsn`` being defined before the
    call.

    :param date_str: date handed unchanged to ``get_raw_data_by_date``
    :return: tuple ``(amazon_count, amazon_sum, amazon_count_dict)``;
        ``amazon_count_dict[country_code][category_id]`` is the number of
        mapped ranks of that country/category, with 'UK' reported as 'GB'.
        When no raw row carries any rank the result is ``(0, 0, {})``.
    """
    raw_reader = AmazonRaw()
    # NOTE(review): presumably switches off category-id translation so that the
    # raw category ids are kept -- confirm against AmazonRaw.
    raw_reader.category_id_mapping = None
    raw_df = raw_reader.get_raw_data_by_date(date_str)

    amazon_count_dict = {}
    row_keys = []     # (country_code, category_id) of every raw row that was kept
    rank_frames = []  # de-duplicated (code, rank) frame of every raw row that was kept
    for _, row in raw_df.iterrows():
        if len(row.app_rank_list) == 0:
            continue
        country_code = 'GB' if row.country_code == 'UK' else row.country_code
        legacy_category_id = row.category_id
        amazon_count_dict.setdefault(country_code, {}).setdefault(legacy_category_id, 0)

        ranks = pd.DataFrame(
            [[app_code, position] for position, app_code in enumerate(row.app_rank_list, start=1)],
            columns=["code", "rank"])
        best_ranks = ranks.groupby('code', as_index=False).aggregate({'rank': 'min'}).reindex(columns=["code", "rank"])
        # Remember which raw row the ranks came from, so the per-country /
        # per-category breakdown can be filled after the single bulk lookup
        # instead of issuing one mapping query per raw row.
        best_ranks['_raw_row'] = len(row_keys)
        row_keys.append((country_code, legacy_category_id))
        rank_frames.append(best_ranks)

    if not rank_frames:
        # pd.concat raises ValueError on an empty list; nothing to count here.
        return 0, 0, amazon_count_dict

    all_ranks = pd.concat(rank_frames, ignore_index=True, sort=False)
    code_to_app = AmazonRaw().get_code_app_id_mapping(list(set(all_ranks.code.tolist())), aa_amazon_dsn)
    mapped = all_ranks.merge(code_to_app, on='code', how="left")
    mapped = mapped[mapped.id.notnull()]

    for row_no in mapped['_raw_row'].tolist():
        country_code, legacy_category_id = row_keys[row_no]
        amazon_count_dict[country_code][legacy_category_id] += 1

    amazon_count = len(mapped)
    amazon_sum = mapped["rank"].sum()

    return amazon_count, amazon_sum, amazon_count_dict
    

# Ad-hoc run for a single date. print is written as a call with one argument
# so the line is valid on both Python 2 and Python 3 and prints the same tuple.
print(get_sum_count_raw('2019-10-11'))

In [0]:

#315541, 56075482
from pyspark.sql import functions as F


def get_sum_count_unified(date_str):
    """Count and sum the unified android-all ranks stored for one date.

    Reads the parquet partition of ``date_str`` through the global ``spark``
    session, keeps the rows whose ``app_id`` is not null and aggregates them
    per (country_code, legacy_category_id): a count and a sum for each of
    free_download, paid_download, revenue and new_free_download.

    :param date_str: value substituted into the ``date=`` part of the path
    :return: tuple ``(unified_count, unified_sum, amazon_count_dict)``;
        ``amazon_count_dict[country_code][str(legacy_category_id)]`` is the
        total of the four counts of that group
    """
    partition_path = "s3://b2c-prod-data-pipeline-unified-store-free/unified/app-tech.store.app-rank.v2/fact/date={}/device_code=android-all/".format(date_str)
    fact_df = spark.read.parquet(partition_path)

    # Note kept from the original author: all 0-3 should be transformed,
    # 2 is not load to db.
    metrics = ["free_download", "paid_download", "revenue", "new_free_download"]
    aggregations = [F.count(name).alias(name + "_count") for name in metrics]
    aggregations += [F.sum(name).alias(name + "_sum") for name in metrics]
    grouped_rows = (
        fact_df.filter('app_id is not null')
        .groupBy(["country_code", "legacy_category_id"])
        .agg(*aggregations)
        .collect()
    )

    unified_count = 0
    unified_sum = 0
    amazon_count_dict = {}
    for group in grouped_rows:
        group_count = sum([group.free_download_count, group.paid_download_count,
                           group.new_free_download_count, group.revenue_count])
        # A sum is None when the group has no value for that metric.
        group_sum = sum([group.free_download_sum or 0, group.paid_download_sum or 0,
                         group.new_free_download_sum or 0, group.revenue_sum or 0])

        per_category = amazon_count_dict.setdefault(group.country_code, {})
        category_key = str(group.legacy_category_id)
        per_category[category_key] = per_category.get(category_key, 0) + group_count

        unified_count += group_count
        unified_sum += group_sum

    return unified_count, unified_sum, amazon_count_dict


# Ad-hoc run for a single date; the returned tuple is not assigned to a name.
get_sum_count_unified('2013-03-27')


In [0]:


def get_date_list(begin_date, end_date, freq):
    """Return the dates between two bounds as 'YYYY-MM-DD' strings.

    The arguments are handed to ``pandas.date_range`` as ``start``, ``end``
    and ``freq``; every generated timestamp is formatted with '%Y-%m-%d'.

    :return: list of date strings in the order produced by ``date_range``
    """
    formatted_dates = []
    for stamp in pd.date_range(start=begin_date, end=end_date, freq=freq):
        formatted_dates.append(stamp.strftime('%Y-%m-%d'))
    return formatted_dates

# begin_date = datetime.datetime.strptime("2012-09-12", '%Y-%m-%d')
# end_date = datetime.datetime.strptime("2019-12-11", '%Y-%m-%d')


import sys  # local to this cell: used to report lookup failures on stderr

begin_date = datetime.datetime.strptime("2012-09-12", '%Y-%m-%d')
end_date = datetime.datetime.strptime("2019-12-20", '%Y-%m-%d')

# Dates whose unified rank count and sum are listed below.
DATE_LIST = ["2012-09-22","2013-03-07","2013-03-27"]

# Header row of the tab-separated listing. The leading "%table" is presumably
# the notebook's table-display marker -- TODO confirm.
# print is written as a call with one argument throughout, so the cell is valid
# on both Python 2 and Python 3 and emits the same text.
print("%table date\tu_count\tu_sum")

for date_str in DATE_LIST:
    try:
        # The raw-side comparison is currently switched off:
        # r_count, r_sum, _ = get_sum_count_raw(date_str)
        u_count, u_sum = AmazonUnified().get_rank_count_and_sum_by_date(date_str)
        # count_diff = r_count - u_count
        # sum_diff = r_sum - u_sum
        # print("{}\t{}\t{}".format(date_str, count_diff, sum_diff))
        print("{}\t{}\t{}".format(date_str, u_count, u_sum))
    except Exception as e:
        # Keep the -10 sentinel row so the listing stays complete, but report
        # the cause on stderr instead of discarding it.
        sys.stderr.write("rank count/sum lookup failed for {}: {!r}\n".format(date_str, e))
        print("{}\t{}\t{}".format(date_str, -10, -10))
    

In [0]:
%%sh

# List what is stored directly under the fact/ prefix of the unified app-rank v2 data set.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-free/unified/app-tech.store.app-rank.v2/fact/


In [0]:
%%sh
