In [0]:


#!/usr/bin/env python
# coding=utf-8
# Copyright (c) 2017 App Annie Inc. All rights reserved.
"""

Implement test cases here
"""

import os
import shutil
import unittest

from aaintdatapipeline.core.conf.settings import ROOT
from aaintdatapipeline.core.fs.device import meta_bucket, raw_bucket, unified_bucket
from aaintdatapipeline.core.fs.device.bucket import unified_data_system_config_bucket
from aaintdatapipeline.core.utils.commandline import env
from aaintdatapipeline.core.utils.encode import activate_system_utf8
from aaintdatapipeline.core.utils.spark import create_spark, eject_all_caches


class PySparkTest(unittest.TestCase):
    """Base test case that shares one Spark session per test class.

    Cached Spark data, the configured buckets and the local Spark metadata
    directories are cleared both before and after the tests of a class run.
    """

    @classmethod
    def setUpClass(cls):
        """Create the shared Spark session and start from a clean state."""
        activate_system_utf8()
        env(PYTHONIOENCODING='utf8')
        session = create_spark()
        cls.spark = session
        cls.sc = session.sparkContext
        cls._release_resource()

    @classmethod
    def tearDownClass(cls):
        """Clean up test data; the Spark session itself is left running."""
        cls._release_resource()
        cls.sc = None
        # Stopping Spark is skipped on purpose for performance:
        # cls.spark.stop()

    @classmethod
    def _release_resource(cls):
        """Drop Spark caches, then remove the physical test files."""
        eject_all_caches(cls.spark)
        bucket_factories = (
            meta_bucket,
            raw_bucket,
            unified_bucket,
            unified_data_system_config_bucket,
        )
        for make_bucket in bucket_factories:
            cls._empty_bucket(make_bucket())
        for dir_name in ("metastore_db", "spark-warehouse"):
            cls._empty_spark_db_dir("{}/{}".format(ROOT, dir_name))

    @classmethod
    def _empty_bucket(cls, bucket):
        """Empty ``bucket`` unless the EMPTY_BUCKET env var is "false" (any case).

        OSError raised while emptying is ignored.
        """
        if os.environ.get('EMPTY_BUCKET', "true").lower() == "false":
            return
        try:
            bucket.empty()
        except OSError:
            pass

    @classmethod
    def _empty_spark_db_dir(cls, db_dir):
        """Remove ``db_dir`` recursively, ignoring OSError (e.g. a missing directory)."""
        try:
            shutil.rmtree(db_dir)
        except OSError:
            pass


In [0]:

import unittest
import sys
import time
import datetime

class TestDemo(PySparkTest):
    """Demo cases used to inspect how the runner reports passes, failures and output.

    Most cases end in ``assert False == True`` and therefore always fail; only
    ``test_pass`` and ``test_pass_log`` succeed.
    """
    checkdate = "2019-10-11"
    granularity = "weekly"
    # Optional fixed timestamp for test_check_date_with_doc_string; when it is
    # falsy the current UTC time is used. Previously this attribute did not
    # exist, so reading it raised AttributeError.
    trigger_date = None

    def test_fail1(self):
        print("fail1")
        assert False == True

    def test_fail2(self):
        print("fail2")
        assert False == True

    def test_fail3(self):
        print("fail3")
        assert False == True

    def test_pass(self):
        print("pass")
        assert True == True

    def test_pass_log(self):
        print("pass log")
        assert True == True

    def test_fail_log(self):
        print("tom is working")
        print("fail")
        assert False == True

    def test_spark_log(self):
        # Use the session created in PySparkTest.setUpClass rather than a
        # notebook-level global, so the case also runs on a fresh kernel.
        self.spark.read.parquet("s3://b2c-prod-advanced-review/oss/ADVRVW_TOPIC_EFFECTIVE_PREDICTIONS/version=1.0.0/platform=2/process_date=2019-10-14/country=AU/language=en/part-00000-fab743d5-e8ff-4f23-86ff-92bacb3fdccd-c000.snappy.parquet").show()
        assert False == True

    def test_check_date_with_doc_string(self):
        """DEMO CHECK DATE{}"""
        utcnow = self.trigger_date or datetime.datetime.utcnow()
        print(utcnow)
        assert False == True

    def test_set_doc_string(self):
        """Mobile Web Led"""
        assert False == True

    def shortDescription(self):
        """Return "<doc> - <granularity> - <checkdate>" for the test report.

        ``<doc>`` is the first line of the test method's docstring, falling
        back to the class docstring (or an empty string) when the method has
        none.
        """
        _doc = self._testMethodDoc
        if not _doc:
            _doc = self.__doc__ or ""
        doc = _doc.split("\n")[0].strip()
        return "{} - {} - {}".format(doc, self.granularity, self.checkdate)

        


In [0]:



def debug(case):
    """Run one test case with a verbose, output-buffering text runner.

    ``sys.stdout`` is put back to the object it referenced on entry, even if
    the run raises.

    :param case: test case instance to run
    :return: None
    """
    saved_stdout = sys.stdout
    try:
        single_case_suite = unittest.TestSuite([case])
        unittest.TextTestRunner(verbosity=2, buffer=True).run(single_case_suite)
    finally:
        sys.stdout = saved_stdout
    

# Run a single TestDemo case through debug() to look at its verbose report.
testcase = TestDemo("test_set_doc_string")
debug(testcase)


In [0]:


# Keep a handle on the current stdout so that a later cell can restore it.
std_out_origin_tom = sys.stdout

log_file = "/tmp/test.log"
# Run every TestDemo case and send the runner's report to the log file.
# The handle is not called `file`, which would shadow the Python 2 builtin
# for every later cell in this kernel.
with open(log_file, "w+") as log_stream:
    suite = unittest.TestSuite()
    suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestDemo))
    runner = unittest.TextTestRunner(log_stream, verbosity=2, buffer=True)
    runner.run(suite)



In [0]:
%%sh
# Show the unittest report written to /tmp/test.log.
cat /tmp/test.log

In [0]:


# Restore the stdout handle saved before the log-file run. `std_out_origin`
# only exists as a local inside debug(), so the module-level handle is used.
sys.stdout = std_out_origin_tom


In [0]:

from bdce.common.utils import update_application_code
# NOTE(review): relies on the notebook-provided global `spark`. Presumably this
# refreshes the application code for the given role/application - confirm
# against the bdce documentation.
update_application_code(spark, role="BDP-PROD-APP-INT-QA", application_name="qa-data-db-check-debug")


In [0]:
%%sh
# Print the PYTHONPATH seen by the shell subprocess.

echo $PYTHONPATH


In [0]:
%%sh
# List the aaintdatapipeline directory under /home/hadoop/bdp/application.
ls -al /home/hadoop/bdp/application/aaintdatapipeline/


In [0]:


import os

# NOTE: this prints every environment variable, which may include credentials;
# clear this cell's output before sharing the notebook.
print(os.environ)


In [0]:


# Copyright (c) 2018 App Annie Inc. All rights reserved.

"""
DB Check modules
"""
import unittest

import datetime
import croniter
from sqlalchemy.dialects.postgresql import psycopg2

from aaintdatapipeline.application.app_qa.common.db_check_utils import query
from aaintdatapipeline.application.app_qa.conf.settings import CITUS_MKT_NAME
from aaintdatapipeline.application.app_qa.conf.settings_local_prod import CITUS_AA_CITUS_DB_NAME, \
    CITUS_AA_CITUS_DB_ACCESS_ID, CITUS_AA_CITUS_DB_HOSTS, CITUS_AA_CITUS_DB_SECRET_KEY
from aaintdatapipeline.application.app_qa.db_check_v1.pyspark_test import PySparkTest
from aaintdatapipeline.core.conf import Conf
from aaintdatapipeline.core.fs.device import S3Bucket, specified_bucket
import zlib
import pandas as pd


# UTILS
def rank_bucket(bucket_str):
    """
    Build a bucket object for the given bucket string.

    The string is wrapped in a ``Conf`` that selects ``S3Bucket`` as the
    bucket class, and that conf is resolved through ``specified_bucket``.

    :param bucket_str: string like: s3://xxx.xxx
    :type bucket_str: str
    :return: bucket_object
    :rtype: bucket_object
    """
    return specified_bucket(Conf(bucket_name=bucket_str, bucket_class=S3Bucket))


class AppStoreRankRawData(object):
    """Base reader for raw app-store rank files kept in a bucket.

    Subclasses set the bucket location and the separators used in each line.
    """
    # Bucket string handed to rank_bucket().
    bucket_name = ""
    # Key prefix inside the bucket; "/<date>/23/" is appended to it.
    bucket_path = ""
    # Separator between the columns of one raw line.
    data_split_str = ""
    # Separator between the entries of the rank-list column.
    rank_list_split_str = ""
    # Separator inside one rank entry; the second token of each entry is kept.
    rank_split_str = ""
    # NOTE(review): the three attributes below are not read by this class.
    accept_feeds = []
    country_code_mapping = {}
    category_id_mapping = {}

    # Column names of the frame returned by get_raw_data_by().
    _columns = ['date', 'country_code', 'category_id', 'metrics', 'app_rank_list']

    def get_raw_data_by(self, date):
        """
        Load every raw rank file stored for ``date`` into one DataFrame.

        :param date: date component of the key prefix "<bucket_path>/<date>/23/"
        :return: one row per non-blank raw line with the columns
            date, country_code, category_id, metrics, app_rank_list;
            an empty frame with those columns when no file is found
        :rtype: pandas.DataFrame
        :raises ValueError: when a line has fewer columns than expected
        raw_data:
        _________________________________________________________________________________
        |    date    |   country_id   |  category_id  |   feed_id   |   rank (app_id)   |
        |------------|----------------|---------------|-------------|-------------------|
        | 2019-04-27 | 143441(bigint) |   6016 (int)  |   0 (int)   | 376510438(bigint) |
        ---------------------------------------------------------------------------------
        unified_data:
        _____________________________________________________________________________________
        |  country_code  |   category_id   |       app_id       | feed_name (free_download) |
        |----------------|-----------------|--------------------|---------------------------|
        |      'US'      | 100026 (bigint) | 376510438 (bigint) |    25 (int) (app_rank)    |
        -------------------------------------------------------------------------------------
        """
        # NOTE(review): the fixed "23" segment presumably selects the last
        # hourly dump of the day - confirm.
        path = "{_bucket_path}/{_date}/23/".format(_date=date, _bucket_path=self.bucket_path)
        bucket = rank_bucket(self.bucket_name)

        # Rows are collected once for all keys; the previous per-key frames
        # re-used one growing list, which duplicated rows of earlier keys.
        rows = []
        for key in bucket.list(path):
            raw_text = zlib.decompress(bucket.get(key))
            if not isinstance(raw_text, str):
                # Python 3 returns bytes here. NOTE(review): UTF-8 is assumed.
                raw_text = raw_text.decode("utf-8")
            for line in raw_text.splitlines():
                if line.strip():
                    rows.append(self._parse_line(line))
        return pd.DataFrame(rows, columns=self._columns)

    def _parse_line(self, line):
        """Split one raw line into the output columns, parsing the rank list."""
        expected = len(self._columns)
        # NOTE(review): the last column is taken to be the rank list, and any
        # extra separators are left inside it - confirm against the raw format.
        fields = line.split(self.data_split_str, expected - 1)
        if len(fields) != expected:
            raise ValueError(
                "expected {} columns separated by {!r}, got {}: {!r}".format(
                    expected, self.data_split_str, len(fields), line))
        fields[-1] = self._parse_rank_list(fields[-1])
        return fields

    def _parse_rank_list(self, rank_field):
        """Turn the rank-list column into a list of rank entries."""
        if self.rank_list_split_str:
            entries = rank_field.split(self.rank_list_split_str)
        else:
            # str.split("") raises ValueError, and no visible subclass sets a
            # list separator. NOTE(review): the column is kept as one entry
            # until the real separator is known.
            entries = [rank_field]
        if self.rank_split_str:
            entries = [entry.split(self.rank_split_str)[1] for entry in entries]
        return entries


class IPhoneRaw(AppStoreRankRawData):
    """Raw rank reader settings for iPhone data (going by the class name)."""
    # NOTE(review): bucket and path are identical to IPadRaw; only accept_feeds
    # differs, and accept_feeds is not read by get_raw_data_by().
    bucket_name = "prod_appannie_ios"
    bucket_path = "country-ranks"
    data_split_str = "\t"
    rank_split_str = " "
    accept_feeds = [0, 1, 2]

class IPadRaw(AppStoreRankRawData):
    """Raw rank reader settings for iPad data (going by the class name)."""
    # NOTE(review): bucket and path are identical to IPhoneRaw; only
    # accept_feeds differs, and accept_feeds is not read by get_raw_data_by().
    bucket_name = "prod_appannie_ios"
    bucket_path = "country-ranks"
    data_split_str = "\t"
    rank_split_str = " "
    accept_feeds = [101, 100, 102]


class MacRaw(AppStoreRankRawData):
    """Raw rank reader settings for Mac data (going by the class name)."""
    # Same bucket as the iOS readers, under the "mac/" key prefix.
    bucket_name = "prod_appannie_ios"
    bucket_path = "mac/country-ranks"
    data_split_str = "\t"
    rank_split_str = " "
    accept_feeds = [200, 201, 202]


class AppleTvRaw(AppStoreRankRawData):
    """Raw rank reader settings for Apple TV data (going by the class name)."""
    # Unlike the iOS/Mac readers, columns are comma-separated here.
    bucket_name = "prod_appannie_appletv"
    bucket_path = "country-ranks"
    data_split_str = ","
    rank_split_str = " "
    accept_feeds = [0, 1, 2]


class AmazonRaw(AppStoreRankRawData):
    """Raw rank reader settings for Amazon data (going by the class name)."""
    # NOTE(review): every value below is identical to AppleTvRaw, including the
    # "prod_appannie_appletv" bucket - looks like a copy-paste placeholder;
    # confirm the real Amazon bucket, path and separators.
    bucket_name = "prod_appannie_appletv"
    bucket_path = "country-ranks"
    data_split_str = ","
    rank_split_str = " "
    accept_feeds = [0, 1, 2]


class GooglePlayRaw(AppStoreRankRawData):
    """Raw rank reader settings for Google Play data (going by the class name)."""
    # NOTE(review): every value below is identical to AppleTvRaw, including the
    # "prod_appannie_appletv" bucket - looks like a copy-paste placeholder;
    # confirm the real Google Play bucket, path and separators.
    bucket_name = "prod_appannie_appletv"
    bucket_path = "country-ranks"
    data_split_str = ","
    rank_split_str = " "
    accept_feeds = [0, 1, 2]


def get_app_rank_cases(params):
    """Unfinished stub: reads ``params['project_name']`` and always returns None.

    :param params: mapping that must contain the key 'project_name'
    :raises KeyError: when 'project_name' is missing
    """
    if params['project_name'] != 'All_Products':
        return None
    lookback = datetime.timedelta(days=2)
    # NOTE(review): the date is computed but never used or returned.
    check_date = datetime.date.today() - lookback
    return None


# CONSTANTS
# Names of the app-store rank metrics.
APP_STORE_RANK_METRICS = [
    "free_download",
    "new_paid_download",
    "revenue",
    "paid_download",
    "new_free_download",
]

# Keyword/value connection string for the Citus database, built from the
# settings module; only the first entry of CITUS_AA_CITUS_DB_HOSTS is used.
citus_dsn = (
    "dbname='{db}' user='{user}' password='{password}' "
    "host='{host}' port='{port}'"
).format(**{
    "db": CITUS_AA_CITUS_DB_NAME,
    "user": CITUS_AA_CITUS_DB_ACCESS_ID,
    "password": CITUS_AA_CITUS_DB_SECRET_KEY,
    "host": CITUS_AA_CITUS_DB_HOSTS[0][0],
    "port": CITUS_AA_CITUS_DB_HOSTS[0][1],
})


class AppleTvUnified(object):
    """Reads unified app-rank facts from the parquet store under ``s3_path``."""
    # NOTE(review): the five attributes below mirror AppleTvRaw and are not
    # read by this class.
    bucket_name = "prod_appannie_appletv"
    bucket_path = "country-ranks"
    data_split_str = ","
    rank_split_str = " "
    accept_feeds = [0, 1, 2]
    # Root of the unified fact table; partition folders are appended in get().
    s3_path = "s3://b2c-prod-data-pipeline-unified-store-free/unified/app-tech.store.app-rank.v1/fact/"

    def get(self, date, device_code, country_code, category_id, spark_session=None):
        """
        Load one date/device partition and filter it by country and category.

        :param date: value of the ``date=`` partition folder
        :param device_code: value of the ``device_code=`` partition folder
        :param country_code: value matched against the country_code column
        :param category_id: value matched against the category_id column
        :param spark_session: Spark session to read with; defaults to the
            notebook-level global ``spark``
        :return: the filtered rows
        :rtype: pandas.DataFrame
        """
        session = spark if spark_session is None else spark_session
        # The partition path is anchored at s3_path; it used to be read as a
        # bare relative "date=.../device_code=.../" path.
        partition_path = "{root}/date={date}/device_code={device_code}/".format(
            root=self.s3_path.rstrip("/"), date=date, device_code=device_code)
        df = session.read.parquet(partition_path)
        return df.filter("country_code='{country_code}' and category_id='{category_id}'".format(
            country_code=country_code, category_id=category_id)).toPandas()


class AppleTvDB(object):
    """Reads app-rank rows for Apple TV from the Citus database."""
    # NOTE(review): schema and table repeat the raw bucket name/path - confirm
    # they are the real database identifiers.
    schema = "prod_appannie_appletv"
    table = "country-ranks"
    device_code = 'tv-os-tv'

    def get(self, date, device_code, country_code, category_id):
        """
        Query the rank table for one date, country and category.

        :param date: value matched against the date column
        :param device_code: currently unused by the query
        :param country_code: value matched against the country_code column
        :param category_id: value matched against the category_id column
        :return: the rows returned by the module-level ``query`` helper
        :rtype: pandas.DataFrame
        """
        # The statement used to be sent with its {schema}/{table} placeholders
        # unformatted, an empty date literal and a dangling "and".
        # Identifiers are double-quoted because the table name has a hyphen.
        # NOTE(review): the country_code/category_id column names mirror the
        # unified store filter and are an assumption - confirm against the
        # table definition. Values are interpolated as text, so only pass
        # trusted input.
        sql = (
            'SELECT * from "{schema}"."{table}" '
            "where date = '{date}' "
            "and country_code = '{country_code}' "
            "and category_id = '{category_id}'"
        ).format(
            schema=self.schema,
            table=self.table,
            date=date,
            country_code=country_code,
            category_id=category_id,
        )
        result = query(citus_dsn, sql)
        return pd.DataFrame(result)

    @staticmethod
    def query(dsn, sql):
        """
        Run ``sql`` on a new connection and return the result as a DataFrame.

        Declared static because it takes no ``self``. ``get`` does not use
        this method; it calls the module-level ``query`` helper.

        :param dsn: connection string
        :param sql: statement to execute
        :rtype: pandas.DataFrame
        """
        # NOTE(review): `psycopg2` is imported from sqlalchemy.dialects.postgresql
        # in this file, which looks like the SQLAlchemy dialect module rather
        # than the database driver - confirm that `connect` is available.
        with psycopg2.connect(dsn) as conn:
            conn.autocommit = True
            with conn.cursor() as cur:
                cur.execute(sql)
                rows = cur.fetchall()
                # DB-API cursors expose column names through `description`;
                # they have no keys() method.
                column_names = [column[0] for column in cur.description]
                conn.commit()
        return pd.DataFrame(rows, columns=column_names)


# CASES

class AppStoreRankDailyTest(PySparkTest):
    """Daily app-store rank checks; every case is currently skipped."""
    check_date = ''
    trigger_date = ''
    # (<cron_time>, <days_delta>) as read by _get_date_from_refresh_routing_config.
    routing_config = ('* 9 * * *', 1)

    @classmethod
    def setUpClass(cls):
        # super() must name this class. Naming PySparkTest started the lookup
        # after PySparkTest and skipped its setUpClass.
        # NOTE(review): confirm that skip was not deliberate.
        super(AppStoreRankDailyTest, cls).setUpClass()

    def setUp(self):
        super(AppStoreRankDailyTest, self).setUp()
        self.trigger_date = datetime.datetime.utcnow()
        self.check_date = self._get_date_from_refresh_routing_config(self.routing_config, self.trigger_date)

    # unittest.skip takes a reason argument; it was previously applied bare.
    @unittest.skip("work in progress: no assertions yet")
    def test_etl_process(self):
        tv_raw_df = AppleTvRaw().get_raw_data_by(self.check_date)
        tv_unified_df = AppleTvUnified().get(self.check_date, "tv-os-tv", "US", 300000)
        tv_db_df = AppleTvDB().get(self.check_date, "tv-os-tv", "US", 300000)

    @unittest.skip("not implemented yet")
    def test_completeness_country(self):
        pass

    @unittest.skip("not implemented yet")
    def test_completeness_category(self):
        pass

    @unittest.skip("not implemented yet")
    def test_completeness_rank(self):
        pass

    @unittest.skip("not implemented yet")
    def test_mode_rank(self):
        pass

    def _get_date_from_refresh_routing_config(self, config, trigger_date=None):
        """
        Return the date that lies <days_delta> days before the previous
        scheduled run of <cron_time>.

        e.g.
        config = ("0 9 * * *", 1), today is 2019-10-27 8:00
        so previous scheduled date&time is 2019-10-26 9:00
        will return "2019-10-25"

        :param config: config format: (<cron_time>, <days_delta>)
        :type config: tuple
        the cron_time please refer to https://support.acquia.com/hc/en-us/articles/360004224494-Cron-time-string-format
        the days_delta is the days ago from the expected date.
        :param trigger_date: reference time for the schedule lookup; the
            current UTC time is used when it is falsy
        :return: date string of "%Y-%m-%d"
        :type return: str
        """
        schedule, days_delta = config
        reference_time = trigger_date or datetime.datetime.utcnow()
        cron = croniter.croniter(schedule, reference_time)
        previous_run = cron.get_prev(datetime.datetime)
        return (previous_run - datetime.timedelta(days=days_delta)).strftime("%Y-%m-%d")




In [0]:
%%sh
