In [0]:

"""
DB Check modules
"""

import unittest
import datetime
from dateutil.relativedelta import relativedelta

from applications.db_check_v1.common.db_check_utils import _get_date_from_refresh_routing_config

from collections import defaultdict
import psycopg2
import datetime
spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import aaplproxy
from aadatapipelinecore.core.urn import Urn
from aaplproxy.da.local_sqlrunner import LocalSqlRunner
from aadatapipelinecore.core.utils.module import application_settings
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import functions as F

def get_store_db_data(sql):
    """
    Run ``sql`` against the ``aa_store_db`` database and return the rows.

    :param sql: SQL text, passed unchanged to ``query``
    :type sql: string
    :return: whatever ``query`` returns for the statement
    """
    # NOTE(review): the connection credentials are hard-coded in source;
    # consider loading them from application settings instead.
    dsn_template = (
        "dbname='{db}' user='{user}' password='{password}' "
        "host='{host}' port='{port}'"
    )
    connection_params = {
        "db": "aa_store_db",
        "user": "citus_bdp_prod_app_int_qa",
        "host": "10.2.6.141",
        "password": "wZw8cfBuuklIskVG",
        "port": 5432,
    }
    return query(dsn_template.format(**connection_params), sql)
def query(dsn, sql):
    """
    Execute ``sql`` on a fresh connection and return all fetched rows.

    :param dsn: libpq connection string passed to ``psycopg2.connect``
    :type dsn: string
    :param sql: statement to execute; it must produce a result set because
        ``fetchall`` is always called
    :type sql: string
    :return: list of row tuples from ``cursor.fetchall()``
    :rtype: list
    """
    conn = psycopg2.connect(dsn)
    try:
        # Autocommit is enabled before any statement runs, so no explicit
        # commit is needed afterwards.
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute(sql)
            return cur.fetchall()
    finally:
        # psycopg2's ``with connection`` block only ends the transaction; it
        # does not close the connection, so close it explicitly to avoid
        # leaking one connection per call.
        conn.close()

# def get_store_db_data(sql):
#     result = query(citus_settings("store"), sql)
#     return result


def get_start_end_date_list(granularity, date):
    """
    Return the ``(start, end)`` date strings of the period ending on ``date``.

    :param granularity: 'weekly', 'monthly', 'quarterly' or 'yearly'
    :type granularity: string
    :param date: last day of the period, formatted 'YYYY-MM-DD'
    :type date: string
    :return: ``(start, end)``; ``end`` is ``date`` unchanged
    :rtype: tuple
    :raises ValueError: if ``granularity`` is not one of the supported values
        (previously this surfaced as an UnboundLocalError on ``start``)
    """
    end = date
    if granularity == 'weekly':
        # The week is the 7 days ending on `date`, so it starts 6 days earlier.
        parsed = datetime.datetime.strptime(date, '%Y-%m-%d')
        start = (parsed - datetime.timedelta(days=6)).strftime('%Y-%m-%d')
    elif granularity == 'monthly':
        start = date[:7] + '-01'
    elif granularity == 'quarterly':
        # First day of the month two months before `date`; working on a
        # zero-based month index handles the wrap into the previous year.
        parsed = datetime.datetime.strptime(date, '%Y-%m-%d')
        month_index = parsed.year * 12 + (parsed.month - 1) - 2
        year, month0 = divmod(month_index, 12)
        start = '%04d-%02d-01' % (year, month0 + 1)
    elif granularity == 'yearly':
        start = date[:4] + '-01-01'
    else:
        raise ValueError("unsupported granularity: {!r}".format(granularity))
    return start, end


def get_check_date_granularity(date):
    """
    List the granularities whose period closes on ``date``.

    :param date: check date formatted 'YYYY-MM-DD'
    :type date: string
    :return: subset of ['weekly', 'monthly', 'quarterly', 'yearly'], in that order
    :rtype: list
    """
    parsed = datetime.datetime.strptime(date, '%Y-%m-%d')
    month_day = date[-5:]
    closing_checks = (
        # isoweekday() == 6 is Saturday.
        ("weekly", parsed.isoweekday() == 6),
        ("monthly", last_day_of_month(parsed) == date),
        ("quarterly", month_day in ('03-31', '06-30', '09-30', '12-31')),
        ("yearly", month_day == '12-31'),
    )
    return [name for name, closes_today in closing_checks if closes_today]


def last_day_of_month(check_month):
    """
    Return the last day of ``check_month``'s month as a 'YYYY-MM-DD' string.

    :param check_month: any date/datetime inside the month of interest
    :return: formatted last day of that month
    :rtype: string
    """
    # Day 28 exists in every month, and 28 + 4 days always falls in the next month.
    in_next_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    # Stepping back by that day-of-month lands on the last day of the original month.
    month_end = in_next_month - datetime.timedelta(days=in_next_month.day)
    return month_end.strftime("%Y-%m-%d")





class TestStoreDownloadRevenue(unittest.TestCase):
    """
    Compare the pre-aggregated store download/revenue tables with the sums of
    the daily fact tables, for every period that closes on the check date.
    """
    trigger_date_config = ("9 * * * *", 2)

    sql_daily_category = '''SELECT 
                               Sum(est_free_app_download), 
                               Sum(est_paid_app_download), 
                               Sum(est_revenue) 
                        FROM   (SELECT app_id, 
                                       device_code, 
                                       country_code, 
                                       category_id, 
                                       Sum(est_free_app_download) AS est_free_app_download, 
                                       Sum(est_paid_app_download) AS est_paid_app_download, 
                                       Sum(est_revenue)           AS est_revenue 
                                FROM   store.store_est_category_fact_v1 
                                WHERE  date BETWEEN '{}' AND '{}' 
                                GROUP  BY app_id, 
                                          device_code, 
                                          country_code, 
                                          category_id) AS t;  '''

    sql_daily_est = '''SELECT 
                               Sum(est_free_app_download), 
                               Sum(est_paid_app_download), 
                               Sum(est_revenue) 
                        FROM   (SELECT app_id, 
                                       device_code, 
                                       country_code, 
                                       Sum(est_free_app_download) AS est_free_app_download, 
                                       Sum(est_paid_app_download) AS est_paid_app_download, 
                                       Sum(est_revenue)           AS est_revenue 
                                FROM   store.store_est_fact_v2 
                                WHERE  date BETWEEN '{}' AND '{}' 
                                GROUP  BY app_id, 
                                          device_code, 
                                          country_code) AS t;  '''

    # NOTE(review): both pre-agg queries filter on country_code='US' while the
    # daily queries above have no country filter -- confirm this is intended.
    sql_pre_agg_category='''SELECT  
                                   Sum(est_free_app_download), 
                                   Sum(est_paid_app_download), 
                                   Sum(est_revenue) 
                            FROM   store.store_est_category_t_{}_fact_v1 
                            WHERE  date BETWEEN '{}' AND  '{}' 
                            AND    granularity='{}' and country_code='US' '''


    sql_pre_agg_est='''SELECT 
                               Sum(est_free_app_download), 
                               Sum(est_paid_app_download), 
                               Sum(est_revenue) 
                        FROM   store.store_est_t_{}_fact_v2 
                        WHERE  date BETWEEN '{}' AND  '{}' 
                        AND    granularity='{}' and country_code='US' '''

    def setUp(self):
        self.failed_ids = defaultdict(list)

    @staticmethod
    def _mismatched_metrics(metric_names, daily_row, pre_agg_row):
        """
        Pair up the two result rows and return the metrics whose sums differ.

        :param metric_names: labels, one per column of the rows
        :param daily_row: sums computed from the daily fact table
        :param pre_agg_row: sums read from the pre-aggregated table
        :return: list of ``(metric, (daily, pre_agg))`` for every mismatch
        :rtype: list
        """
        mismatches = []
        for name, daily, pre_agg in zip(metric_names, daily_row, pre_agg_row):
            # int() truncates like the former long() and promotes to long
            # automatically on Python 2 when the value is large.
            pair = (int(daily), int(pre_agg))
            if pair[0] != pair[1]:
                mismatches.append((name, pair))
        return mismatches

    def test_store_download_revenue_pre_agg(self):
        """Every metric of every closing period must match between daily and pre-agg tables."""
        # NOTE(review): this local value shadows the class-level
        # trigger_date_config above -- confirm which one is intended.
        trigger_date_config = ("9 * * * *", 5)
        check_date_str, _ = _get_date_from_refresh_routing_config(trigger_date_config, "test")
        check_date_str = str(check_date_str)
        check_list = [{x: get_start_end_date_list(x, check_date_str)}
                      for x in get_check_date_granularity(check_date_str)]
        print(check_date_str)
        print(check_list)
        for agg_data in check_list:
            for granularity, (start_date, end_date) in agg_data.items():
                print("%s %s" % (granularity, (start_date, end_date)))
                # The pre-agg table name is keyed by the granularity's first letter.
                table_suffix = granularity[0]
                est_daily_agg_result = get_store_db_data(
                    self.sql_daily_est.format(start_date, end_date))[0]
                est_pre_agg_result = get_store_db_data(
                    self.sql_pre_agg_est.format(table_suffix, start_date, end_date, granularity))[0]

                category_daily_agg_result = get_store_db_data(
                    self.sql_daily_category.format(start_date, end_date))[0]
                category_pre_agg_result = get_store_db_data(
                    self.sql_pre_agg_category.format(table_suffix, start_date, end_date, granularity))[0]

                # Previously the comprehension clauses were in the wrong order
                # and relied on a leaked loop variable, so only the last metric
                # was ever compared; every metric is checked now.
                est_metric = ['est_free_app_download', 'est_paid_app_download', 'est_revenue_download']
                failed_ids = self._mismatched_metrics(
                    est_metric, est_daily_agg_result, est_pre_agg_result)
                self.assertTrue(len(failed_ids) == 0, failed_ids)

                category_metric = ['category_free_app_download', 'category_paid_app_download',
                                   'category_revenue_download']
                failed_ids = self._mismatched_metrics(
                    category_metric, category_daily_agg_result, category_pre_agg_result)
                self.assertTrue(len(failed_ids) == 0, "pre_agg metric %s failed" % failed_ids)

                


# Run the tests in-process: argv=[''] supplies a dummy program name so unittest
# does not parse the notebook kernel's own arguments, and exit=False keeps
# unittest from calling sys.exit() when the run finishes.
unittest.main(argv=[''], verbosity=2, exit=False)

In [0]:

"""
get date:  [ [month, [days]], [month, [days]], [month, [days]], ....... ]
"""
def get_date_list(start_date, end_date, freq="D"):
    """
    Return the dates from ``start_date`` to ``end_date`` as 'YYYY-MM-DD' strings.

    freq:   D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency

    NOTE(review): newer pandas releases rename some of these aliases -- verify
    against the installed pandas version.

    :param start_date: first date of the range (anything ``pd.date_range`` accepts)
    :param end_date: last date of the range, inclusive
    :param freq: pandas frequency alias, see above
    :return: list of formatted date strings; empty when start is after end
    :rtype: list
    """
    # Imported here because no visible cell of this notebook binds `pd`.
    import pandas as pd

    return [day.strftime('%Y-%m-%d')
            for day in pd.date_range(start=start_date, end=end_date, freq=freq)]

# Quick manual check: print every calendar day from 2010-01-01 to 2010-03-01.
print get_date_list("2010-01-01","2010-03-01")

In [0]:

from bdce.common.utils import update_application_code
# NOTE(review): presumably reloads the named application's code into this
# Spark session under the given role -- confirm against bdce.common.utils.
update_application_code(
    spark, role="BDP-PROD-APP-INT-QA", application_name="qa-data-db-check-fiona"
)

In [0]:

import datetime
from dateutil.relativedelta import relativedelta

def get_granularity(date):
    """
    List the granularities whose period closes on ``date``.

    :param date: check date formatted 'YYYY-MM-DD'
    :type date: string
    :return: subset of ['weekly', 'monthly', 'quarterly', 'yearly'], in that order
    :rtype: list
    """
    quarter_ends = ('03-31', '06-30', '09-30', '12-31')
    year_ends = ('12-31',)

    parsed_date = datetime.datetime.strptime(date, '%Y-%m-%d')
    month_and_day = date[-5:]

    matched = []
    # isoweekday() == 6 is Saturday.
    matched += ["weekly"] if parsed_date.isoweekday() == 6 else []
    matched += ["monthly"] if last_day_of_month(parsed_date) == date else []
    matched += ["quarterly"] if month_and_day in quarter_ends else []
    matched += ["yearly"] if month_and_day in year_ends else []
    return matched


def last_day_of_month(check_month):
    """
    Format the final day of the month containing ``check_month``.

    :param check_month: date or datetime inside the month of interest
    :return: last day of that month as 'YYYY-MM-DD'
    :rtype: string
    """
    four_days = datetime.timedelta(days=4)
    # The 28th plus four days is always somewhere in the following month.
    following = check_month.replace(day=28) + four_days
    # Subtracting the day-of-month rewinds to the last day of the original month.
    rewind = datetime.timedelta(days=following.day)
    return (following - rewind).strftime("%Y-%m-%d")


# Quick manual check on a leap-day date.
print get_granularity("2020-02-29")



def get_date_list(granularity, date):
    """
    Return the ``(start, end)`` date strings of the period ending on ``date``.

    :param granularity: 'weekly', 'monthly', 'quarterly' or 'yearly'
    :type granularity: string
    :param date: last day of the period, formatted 'YYYY-MM-DD'
    :type date: string
    :return: ``(start, end)``; ``end`` is ``date`` unchanged
    :rtype: tuple
    :raises ValueError: if ``granularity`` is not one of the supported values
        (previously this surfaced as an UnboundLocalError on ``start``)
    """
    end = date
    if granularity == 'weekly':
        # The week is the 7 days ending on `date`, so it starts 6 days earlier.
        parsed = datetime.datetime.strptime(date, '%Y-%m-%d')
        start = (parsed - datetime.timedelta(days=6)).strftime('%Y-%m-%d')
    elif granularity == 'monthly':
        start = date[:7] + '-01'
    elif granularity == 'quarterly':
        # First day of the month two months before `date`; working on a
        # zero-based month index handles the wrap into the previous year.
        parsed = datetime.datetime.strptime(date, '%Y-%m-%d')
        month_index = parsed.year * 12 + (parsed.month - 1) - 2
        year, month0 = divmod(month_index, 12)
        start = '%04d-%02d-01' % (year, month0 + 1)
    elif granularity == 'yearly':
        start = date[:4] + '-01-01'
    else:
        raise ValueError("unsupported granularity: {!r}".format(granularity))
    return start, end

# Quick manual check: for every granularity that closes on 2016-12-31, print
# the granularity and the (start, end) pair computed for it.
t = [ {x :get_date_list(x, '2016-12-31' )} for x in get_granularity("2016-12-31")]
for x in t:
    for key,value in x.items():
        print key,value

In [0]:

import unittest
import datetime
from collections import defaultdict
from dateutil.relativedelta import relativedelta

from applications.db_check_v1.common.db_check_utils import _get_date_from_refresh_routing_config
from applications.db_check_v1.common.base_test import PipelineTest

from collections import defaultdict
import psycopg2
import datetime
spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import aaplproxy
from aadatapipelinecore.core.urn import Urn
from aaplproxy.da.local_sqlrunner import LocalSqlRunner
from aadatapipelinecore.core.utils.module import application_settings
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import functions as F


def get_store_db_data(sql):
    """
    Execute ``sql`` on the ``aa_store_db`` database and hand back the rows.

    :param sql: SQL text, passed unchanged to ``query``
    :type sql: string
    :return: whatever ``query`` returns for the statement
    """
    # NOTE(review): the connection credentials are hard-coded in source;
    # consider loading them from application settings instead.
    settings = dict(
        db="aa_store_db",
        user="citus_bdp_prod_app_int_qa",
        host="10.2.6.141",
        password="wZw8cfBuuklIskVG",
        port=5432,
    )
    citus_dsn_ = (
        "dbname='{db}' user='{user}' password='{password}' "
        "host='{host}' port='{port}'"
    ).format(**settings)
    return query(citus_dsn_, sql)
def query(dsn, sql):
    """
    Execute ``sql`` on a fresh connection and return all fetched rows.

    :param dsn: libpq connection string passed to ``psycopg2.connect``
    :type dsn: string
    :param sql: statement to execute; it must produce a result set because
        ``fetchall`` is always called
    :type sql: string
    :return: list of row tuples from ``cursor.fetchall()``
    :rtype: list
    """
    conn = psycopg2.connect(dsn)
    try:
        # Autocommit is enabled before any statement runs, so no explicit
        # commit is needed afterwards.
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute(sql)
            return cur.fetchall()
    finally:
        # psycopg2's ``with connection`` block only ends the transaction; it
        # does not close the connection, so close it explicitly to avoid
        # leaking one connection per call.
        conn.close()

def get_start_end_date_list(granularity, date):
    """
    Return the ``(start, end)`` date strings of the period ending on ``date``.

    :param granularity: 'weekly', 'monthly', 'quarterly' or 'yearly'
    :type granularity: string
    :param date: last day of the period, formatted 'YYYY-MM-DD'
    :type date: string
    :return: ``(start, end)``; ``end`` is ``date`` unchanged
    :rtype: tuple
    :raises ValueError: if ``granularity`` is not one of the supported values
        (previously this surfaced as an UnboundLocalError on ``start``)
    """
    end = date
    if granularity == 'weekly':
        # The week is the 7 days ending on `date`, so it starts 6 days earlier.
        parsed = datetime.datetime.strptime(date, '%Y-%m-%d')
        start = (parsed - datetime.timedelta(days=6)).strftime('%Y-%m-%d')
    elif granularity == 'monthly':
        start = date[:7] + '-01'
    elif granularity == 'quarterly':
        # First day of the month two months before `date`; working on a
        # zero-based month index handles the wrap into the previous year.
        parsed = datetime.datetime.strptime(date, '%Y-%m-%d')
        month_index = parsed.year * 12 + (parsed.month - 1) - 2
        year, month0 = divmod(month_index, 12)
        start = '%04d-%02d-01' % (year, month0 + 1)
    elif granularity == 'yearly':
        start = date[:4] + '-01-01'
    else:
        raise ValueError("unsupported granularity: {!r}".format(granularity))
    return start, end


def get_check_date_granularity(date):
    """
    Work out which aggregation periods end on ``date``.

    :param date: check date formatted 'YYYY-MM-DD'
    :type date: string
    :return: subset of ['weekly', 'monthly', 'quarterly', 'yearly'], in that order
    :rtype: list
    """
    check_date = datetime.datetime.strptime(date, '%Y-%m-%d')
    suffix = date[-5:]

    is_week_end = check_date.isoweekday() == 6  # 6 is Saturday
    is_month_end = last_day_of_month(check_date) == date
    is_quarter_end = suffix in ['03-31', '06-30', '09-30', '12-31']
    is_year_end = suffix in ['12-31']

    flags = [
        ("weekly", is_week_end),
        ("monthly", is_month_end),
        ("quarterly", is_quarter_end),
        ("yearly", is_year_end),
    ]
    return [label for label, applies in flags if applies]


def last_day_of_month(check_month):
    """
    Compute the month-end date for the month ``check_month`` falls in.

    :param check_month: date or datetime inside the month of interest
    :return: last day of that month as 'YYYY-MM-DD'
    :rtype: string
    """
    # Anchor on the 28th (present in every month) and jump 4 days ahead, which
    # always crosses into the next month.
    anchor = check_month.replace(day=28)
    spill_over = anchor + datetime.timedelta(days=4)
    # Going back by the spill-over's day number yields the previous month's end.
    end_of_month = spill_over - datetime.timedelta(days=spill_over.day)
    return end_of_month.strftime("%Y-%m-%d")


class TestStoreDownloadRevenuePreAgg(PipelineTest):
    """
    Compare the pre-aggregated store download/revenue tables with the sums of
    the daily fact tables, for every period that closes on the check date.
    """
    trigger_date_config = ("13 * * * *", 6)

    sql_daily_category = "SELECT Sum(est_free_app_download), Sum(est_paid_app_download), Sum(est_revenue) " \
                         "FROM (SELECT app_id, device_code, country_code, category_id, " \
                         "Sum(est_free_app_download) AS est_free_app_download, " \
                         "Sum(est_paid_app_download) AS est_paid_app_download, " \
                         "Sum(est_revenue) AS est_revenue " \
                         "FROM store.store_est_category_fact_v1 " \
                         "WHERE date BETWEEN '{}' AND '{}' " \
                         "GROUP BY app_id, device_code, country_code, category_id) AS t;"

    sql_daily_est = "SELECT Sum(est_free_app_download), Sum(est_paid_app_download), Sum(est_revenue) " \
                    "FROM (SELECT app_id, device_code, country_code, " \
                    "Sum(est_free_app_download) AS est_free_app_download, " \
                    "Sum(est_paid_app_download) AS est_paid_app_download, " \
                    "Sum(est_revenue) AS est_revenue FROM store.store_est_fact_v2 " \
                    "WHERE date BETWEEN '{}' AND '{}' GROUP BY app_id, device_code, country_code) AS t;"

    sql_pre_agg_category = "SELECT Sum(est_free_app_download), Sum(est_paid_app_download), " \
                           "Sum(est_revenue) FROM store.store_est_category_t_{}_fact_v1 " \
                           "WHERE date BETWEEN '{}' AND '{}' AND granularity='{}'"

    sql_pre_agg_est = "SELECT Sum(est_free_app_download), Sum(est_paid_app_download), Sum(est_revenue) " \
                      "FROM store.store_est_t_{}_fact_v2 " \
                      "WHERE date BETWEEN '{}' AND '{}' AND granularity='{}'"

    failed_ids = defaultdict(list)

    @staticmethod
    def _mismatched_metrics(metric_names, daily_row, pre_agg_row):
        """
        Pair up the two result rows and return the metrics whose sums differ.

        :param metric_names: labels, one per column of the rows
        :param daily_row: sums computed from the daily fact table
        :param pre_agg_row: sums read from the pre-aggregated table
        :return: list of ``(metric, (daily, pre_agg))`` for every mismatch
        :rtype: list
        """
        mismatches = []
        for name, daily, pre_agg in zip(metric_names, daily_row, pre_agg_row):
            # int() truncates like the former long() and promotes to long
            # automatically on Python 2 when the value is large.
            pair = (int(daily), int(pre_agg))
            if pair[0] != pair[1]:
                mismatches.append((name, pair))
        return mismatches

    def test_store_download_revenue_pre_agg(self):
        """Every metric of every closing period must match between daily and pre-agg tables."""
        check_list = [{x: get_start_end_date_list(x, self.check_date_str)} for x in
                      get_check_date_granularity(self.check_date_str)]

        print(check_list)
        for agg_data in check_list:
            for granularity, (start_date, end_date) in agg_data.items():
                print("%s %s" % (granularity, (start_date, end_date)))
                # The pre-agg table name is keyed by the granularity's first letter.
                table_suffix = granularity[0]
                est_daily_agg_result = get_store_db_data(
                    self.sql_daily_est.format(start_date, end_date))[0]
                est_pre_agg_result = get_store_db_data(
                    self.sql_pre_agg_est.format(table_suffix, start_date, end_date, granularity))[0]

                category_daily_agg_result = get_store_db_data(
                    self.sql_daily_category.format(start_date, end_date))[0]
                category_pre_agg_result = get_store_db_data(
                    self.sql_pre_agg_category.format(table_suffix, start_date, end_date, granularity))[0]

                # Previously the comprehension clauses were in the wrong order
                # and relied on a leaked loop variable, so only the last metric
                # was ever compared; every metric is checked now.
                est_metric = ['est_free_app_download', 'est_paid_app_download', 'est_revenue_download']
                failed_ids = self._mismatched_metrics(
                    est_metric, est_daily_agg_result, est_pre_agg_result)
                self.assertTrue(len(failed_ids) == 0, failed_ids)

                category_metric = ['category_free_app_download', 'category_paid_app_download',
                                   'category_revenue_download']
                failed_ids = self._mismatched_metrics(
                    category_metric, category_daily_agg_result, category_pre_agg_result)
                self.assertTrue(len(failed_ids) == 0, "pre_agg metric %s failed" % failed_ids)
                
                
                
class TestStoreDownloadRevenue(PipelineTest):
    """
    Sanity checks on the daily store estimate tables for the check date:
    rows exist, download-attribution columns hold their initial values, and
    the hot countries are present for each device.
    """
    trigger_date_config = ("9 * * * *", 2)
    hot_country_list = [ 'AU', 'BR', 'CN', 'FR', 'DE', 'IN', 'JP', 'KR', 'GB', 'US', 'WW']

    sql_est_count = '''SELECT Count(1) FROM store.store_est_fact_v2 WHERE  date = '{}' '''

    sql_category_count = "SELECT Count(1) FROM store.store_est_category_fact_v1 WHERE date = '{}'"

    sql_est_country_count = "SELECT DISTINCT device_code, Count(country_code) " \
                            "FROM (SELECT DISTINCT device_code, country_code " \
                            "FROM store.store_est_fact_v2 WHERE date = '{}' And country_code in ('{}') ) AS prod " \
                            "GROUP BY device_code ORDER By device_code"

    sql_category_country_count = "SELECT DISTINCT device_code, Count(country_code) " \
                                 "FROM (SELECT DISTINCT device_code, country_code " \
                                 "FROM store.store_est_category_fact_v1 WHERE date = '{}' And country_code in ('{}') ) AS prod " \
                                 "GROUP BY device_code ORDER By device_code"

    sql_download_att_init_value = "SELECT Sum(est_free_app_download) + Sum(est_paid_app_download) " \
                                  "AS est_app_download, Sum(est_organic_download) AS est_organic_download, " \
                                  "Sum(est_paid_download) AS est_paid_download " \
                                  "FROM store.store_est_fact_v2 WHERE date = '{}'"

    failed_ids = defaultdict(list)

    def _assert_hot_country_coverage(self, device_rows, table_label):
        """
        Assert the first three device rows all report every hot country.

        :param device_rows: ``(device_code, country_count)`` rows ordered by device
        :param table_label: short name of the table, used in failure messages
        """
        # The original indexed rows 0 and 2 directly, so at least three device
        # rows are expected; fail with a message instead of an IndexError.
        self.assertGreaterEqual(
            len(device_rows), 3,
            "{} country check returned only {} device rows for date {}".format(
                table_label, len(device_rows), self.check_date_str))
        counts = [row[1] for row in device_rows[:3]]
        # Row 0 used to be compared with itself, leaving row 1 unchecked.
        self.assertTrue(
            counts[0] == counts[1] == counts[2],
            "{} hot country counts differ between devices: {}".format(table_label, counts))
        self.assertEqual(
            counts[0], len(self.hot_country_list),
            "{} hot country count is {}, expected {}".format(
                table_label, counts[0], len(self.hot_country_list)))

    def test_store_download_revenue(self):
        """Run the completeness and initial-value checks for the check date."""
        hot_countries = "','".join(self.hot_country_list)

        db_est_count = get_store_db_data(self.sql_est_count.format(self.check_date_str))[0][0]
        db_category_count = get_store_db_data(self.sql_category_count.format(self.check_date_str))[0][0]
        db_download_attr_init = get_store_db_data(
            self.sql_download_att_init_value.format(self.check_date_str))[0]
        db_est_country_count = get_store_db_data(
            self.sql_est_country_count.format(self.check_date_str, hot_countries))
        db_category_country_count = get_store_db_data(
            self.sql_category_country_count.format(self.check_date_str, hot_countries))

        self.assertNotEqual(db_est_count, 0, "est data is not ready for date {}".format(self.check_date_str))
        self.assertNotEqual(db_category_count, 0,
                            "category data is not ready for date {}".format(self.check_date_str))

        # Columns: [0] free + paid app downloads, [1] organic downloads, [2] paid downloads.
        self.assertEqual(db_download_attr_init[0], db_download_attr_init[1],
                         "download attr organic paid init value != free_app_download + paid_app_download")
        self.assertEqual(db_download_attr_init[2], 0,
                         "download attribution paid download init data is not equals to 0")

        self._assert_hot_country_coverage(db_est_country_count, "est")
        self._assert_hot_country_coverage(db_category_country_count, "category")



class DnaChangeLog(object):
    """
    Get data from unified s3 bucket, then return a DataFrame
    """
    _unified_s3_path = "s3://b2c-prod-data-pipeline-unified-dna/unified/dna.mapping-log.v1/" \
                       "dimension/update_date={}"

    def __init__(self, spark):
        self.spark = spark

    def _path_for(self, date):
        """
        Build the s3 partition path for ``date``.

        :param date: 'YYYY-MM-DD' string, or a date/datetime object
        :return: s3 path of that ``update_date`` partition
        :rtype: string
        """
        # Accept date/datetime objects too, so a caller passing a datetime
        # does not put a time component into the partition path.
        if hasattr(date, "strftime"):
            date = date.strftime("%Y-%m-%d")
        return self._unified_s3_path.format(date)

    def get(self, date):
        """
        :param date:  update date in s3 bucket path
        :type date: string
        :return: unified_df
        :rtype: DataFrame
        """
        # The partition used to be hard-coded to "2020-07-20", which silently
        # ignored the `date` argument.
        mapping_df = self.spark.read.parquet(self._path_for(date))
        unified_df = mapping_df.select(
            "app_id", "company_id", "parent_company_id", "publisher_id").distinct()
        return unified_df


class StoreEstDaily(object):
    """
    Reads the store app-est-dna-log delta table from the unified s3 bucket
    and returns the daily id mapping as a DataFrame.
    """
    _unified_s3_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log.v1/fact/"

    def __init__(self, spark):
        # Spark session used for the read.
        self.spark = spark

    def get(self, date):
        """
        :param date:  value matched against the ``date`` column (daily rows only)
        :type date: string
        :return: distinct app/company/parent company/publisher id rows
        :rtype: DataFrame
        """
        fact_df = self.spark.read.format('delta').load(self._unified_s3_path)
        daily_filter = "granularity='daily' and date='{}' ".format(date)
        id_columns = ("app_id", "company_id", "parent_company_id", "publisher_id")
        return fact_df.where(daily_filter).select(*id_columns).distinct()

class TestStoreDownloadRevenue_new(PipelineTest):
    """
    Show the id mappings present in the daily store estimates but absent from
    the DNA change log.
    """
    trigger_date_config = ("0 9 * * *", 1)

    def test_store_download_revenue_dna_change_log_accuracy(self):
        # NOTE(review): this test only displays the differences; it contains
        # no assertion, so it cannot fail on a mismatch.
        change_log_df = DnaChangeLog(self.spark).get(self.check_date)
        daily_est_df = StoreEstDaily(self.spark).get(self.check_date)
        change_log_df.createOrReplaceTempView("dna_unified")
        daily_est_df.createOrReplaceTempView("est_unified")

        # Rows in est_unified that have no identical row in dna_unified
        # (NULL company ids are compared as 0).
        est_minus_dna = (
            "select app_id, publisher_id, coalesce(company_id, 0) as company_id, coalesce(parent_company_id, 0) as parent_company_id  from est_unified "
            "except select app_id, publisher_id,  coalesce(company_id, 0) as company_id, coalesce(parent_company_id, 0) as parent_company_id from dna_unified"
        )
        self.spark.sql(est_minus_dna).show()

        # Same difference, restricted to rows with a non-zero publisher.
        est_minus_dna_with_publisher = (
            "select * from ( " + est_minus_dna + " ) where publisher_id !=0 "
        )
        self.spark.sql(est_minus_dna_with_publisher).show()

        # The reverse difference (dna_unified except est_unified) is not run.

# NOTE(review): send_message is not defined or imported in any cell visible
# here -- presumably it comes from an earlier notebook session; confirm,
# otherwise this line raises NameError.
send_message()


In [0]:
%%sh

cat /tmp/db_check.log




In [0]:

from aadatapipelinecore.core.utils.spark import eject_all_caches

# NOTE(review): presumably drops everything cached in this Spark session --
# confirm against aadatapipelinecore.core.utils.spark.
eject_all_caches(spark)


In [0]:

"""
Test DNA tag/hq/unified_app tables
"""
import unittest
import datetime
from collections import defaultdict


from applications.db_check_v1.common.db_check_utils import _get_date_from_refresh_routing_config
from applications.db_check_v1.common.base_test import PipelineTest
from applications.db_check_v1.common.html_report_test_runner import HTMLTestRunner



from aadatapipelinecore.core.monitoring.pipeline_monitor import running, fail, task_success
from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.utils.identifier import package_id
from aadatapipelinecore.core.utils.spark import canned_spark, stop


from applications.db_check_v1.common.html_report_test_runner import HTMLTestRunner
from applications.db_check_v1.common.utils import send_db_check_email

import psycopg2
import datetime
from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.utils.module import application_settings
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import functions as F

def get_dna_db_data(sql):
    """
    Run ``sql`` against the ``aa_store_db`` database and return the rows.

    :param sql: SQL text, passed unchanged to ``query``
    :type sql: string
    :return: whatever ``query`` returns for the statement
    """
    # NOTE(review): the connection credentials are hard-coded in source;
    # consider loading them from application settings instead.
    dsn_parts = [
        "dbname='{db}'".format(db="aa_store_db"),
        "user='{user}'".format(user="citus_bdp_prod_app_int_qa"),
        "password='{password}'".format(password="wZw8cfBuuklIskVG"),
        "host='{host}'".format(host="10.2.6.141"),
        "port='{port}'".format(port=5432),
    ]
    return query(" ".join(dsn_parts), sql)
def query(dsn, sql):
    """
    Execute ``sql`` on a fresh connection and return all fetched rows.

    :param dsn: libpq connection string passed to ``psycopg2.connect``
    :type dsn: string
    :param sql: statement to execute; it must produce a result set because
        ``fetchall`` is always called
    :type sql: string
    :return: list of row tuples from ``cursor.fetchall()``
    :rtype: list
    """
    conn = psycopg2.connect(dsn)
    try:
        # Autocommit is enabled before any statement runs, so no explicit
        # commit is needed afterwards.
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute(sql)
            return cur.fetchall()
    finally:
        # psycopg2's ``with connection`` block only ends the transaction; it
        # does not close the connection, so close it explicitly to avoid
        # leaking one connection per call.
        conn.close()


class DnaHQUnifiedData(object):
    """
    Reads one date/hour partition of the ``dna.company-hq-mapping.v1``
    dimension from the unified s3 bucket as a DataFrame.
    """
    _unified_s3_path = (
        "s3://b2c-prod-data-pipeline-unified-dna/unified/dna.company-hq-mapping.v1/"
        "dimension/_update_date={}/_update_hour={}"
    )

    def __init__(self, spark):
        # Spark session used for the read.
        self.spark = spark

    def get(self, date, hour):
        """
        :param date:  value of the ``_update_date`` path segment
        :type date: string
        :param hour:  value of the ``_update_hour`` path segment
        :type hour: string
        :return: contents of that partition
        :rtype: DataFrame
        """
        partition_path = self._unified_s3_path.format(date, hour)
        return self.spark.read.parquet(partition_path)


class DnaTagUnifiedData(object):
    """
    Reads one date/hour partition of the ``dna.app-tag-mapping.v1`` dimension
    from the unified s3 bucket as a DataFrame.
    """
    _unified_s3_path = (
        "s3://b2c-prod-data-pipeline-unified-dna/unified/dna.app-tag-mapping.v1/"
        "dimension/_update_date={}/_update_hour={}"
    )

    def __init__(self, spark):
        # Spark session used for the read.
        self.spark = spark

    def get(self, date, hour):
        """
        :param date:  value of the ``_update_date`` path segment
        :type date: string
        :param hour:  value of the ``_update_hour`` path segment
        :type hour: string
        :return: contents of that partition
        :rtype: DataFrame
        """
        reader = self.spark.read
        tag_partition = self._unified_s3_path.format(date, hour)
        return reader.parquet(tag_partition)


class DnaUnifiedAppSingleAppData(object):
    """
    Reads one date/hour partition of the
    ``dna.in-unified-product-product-mapping.v1`` dimension from the unified
    s3 bucket, keeping only rows with ``status=1``.
    """
    _unified_s3_path = (
        "s3://b2c-prod-data-pipeline-unified-dna/unified/"
        "dna.in-unified-product-product-mapping.v1/dimension/_update_date={}/_update_hour={}"
    )

    def __init__(self, spark):
        # Spark session used for the read.
        self.spark = spark

    def get(self, date, hour):
        """
        :param date:  value of the ``_update_date`` path segment
        :type date: string
        :param hour:  value of the ``_update_hour`` path segment
        :type hour: string
        :return: rows of that partition where ``status=1``
        :rtype: DataFrame
        """
        mapping_path = self._unified_s3_path.format(date, hour)
        all_rows = self.spark.read.parquet(mapping_path)
        return all_rows.where("status=1")


# def get_dna_db_data(sql):
#     result = query(citus_settings("store"), sql)
#     return result


# class TestDNAHQ(PipelineTest):
#     trigger_date_config = ("0 2,14 * * *", 0)

#     sql_hq = "select count(1) from dna.dna_company_hq_dim_v1"

#     def setUp(self):
#         super

#     def test_dna_hq_completeness(self):
#         self.check_date = self._get_check_date_from_routing_config(self.trigger_datetime)
#         print type(self.check_date)
#         day = self.check_date_str
#         hour = self.check_date.strftime("%-H")
#         unified_df = DnaHQUnifiedData(self.spark).get(day, hour)
#         unified_hq_count = unified_df.count()
#         self.assertNotEqual(unified_hq_count, 0, "DNA HQ unified data is not generated!")
#         db_hq_count = get_dna_db_data(self.sql_hq)[0][0]
#         print unified_hq_count, db_hq_count
#         self.assertEqual(unified_hq_count, db_hq_count,
#                          "DNA HQ unified data is not equals to db data! unified is {}, db is {}".format(
#                              unified_hq_count, db_hq_count))


class TestDNAUnifiedSingleAPP(PipelineTest):
    """
    Check that the unified/single-app mapping on s3 has the same row count as
    the ``dna.dna_app_unified_dim_v2`` table.
    """
    trigger_date_config = ("0 1 * * *", 0)
    sql_unified_single_app = "select count(1) from dna.dna_app_unified_dim_v2"

    def test_dna_unified_app_single_app(self):
        print(type(self.check_date))
        partition_day = self.check_date_str
        # "%-H" is the hour without zero padding (a platform-specific flag).
        partition_hour = self.check_date.strftime("%-H")

        reader = DnaUnifiedAppSingleAppData(self.spark)
        unified_app_count = reader.get(partition_day, partition_hour).count()
        self.assertNotEqual(unified_app_count, 0, "unified data is not generated!")

        db_unified_app_count = get_dna_db_data(self.sql_unified_single_app)[0][0]
        mismatch_message = (
            "Unified/single app data is not equals to db data! unified is {}, db is {}"
        ).format(unified_app_count, db_unified_app_count)
        self.assertEqual(unified_app_count, db_unified_app_count, mismatch_message)



# NOTE(review): send_message is not defined or imported in any cell visible
# here -- presumably it comes from an earlier notebook session; confirm,
# otherwise this line raises NameError.
send_message()


In [0]:


import unittest
import datetime
from collections import defaultdict
from dateutil.relativedelta import relativedelta


from applications.db_check_v1.common.db_check_utils import _get_date_from_refresh_routing_config
from applications.db_check_v1.common.base_test import PipelineTest
from applications.db_check_v1.common.html_report_test_runner import HTMLTestRunner



from aadatapipelinecore.core.monitoring.pipeline_monitor import running, fail, task_success
from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.utils.identifier import package_id
from aadatapipelinecore.core.utils.spark import canned_spark, stop


from applications.db_check_v1.common.html_report_test_runner import HTMLTestRunner
from applications.db_check_v1.common.utils import send_db_check_email

import psycopg2
import datetime
from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.utils.module import application_settings
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import functions as F

# def get_store_db_data(sql):
#     citus_dsn_ = (
#         "dbname='{db}' user='{user}' password='{password}' "
#         "host='{host}' port='{port}'".format(
#             db="aa_store_db",
#             user="citus_bdp_prod_app_int_qa",
#             host="10.2.6.141",
#             password="wZw8cfBuuklIskVG",
#             port=5432
#         )
#     )
#     db_data = query(citus_dsn_, sql)
#     return db_data
   
   
def get_store_db_data(sql):
    """
    Execute ``sql`` through ``query`` using the DSN built below and return its result.

    :param sql: SQL statement to execute
    :type sql: string
    :return: the rows returned by ``query``
    """
    # NOTE(review): host, user and password are hard-coded in source; consider
    # loading them from secured settings instead.
    connection_params = dict(
        db="metadb",
        user="app_meta_qa",
        host="b2b-prod-uds-storage-meta-db-new.crlexxwtzodp.us-east-1.rds.amazonaws.com",
        password="IEdwiF2hmqzdKK43",
        port=5432,
    )
    dsn_template = "dbname='{db}' user='{user}' password='{password}' host='{host}' port='{port}'"
    return query(dsn_template.format(**connection_params), sql)
 
    
def query(dsn, sql):
    """
    Run ``sql`` on a fresh psycopg2 connection and return every fetched row.

    :param dsn: libpq connection string passed to ``psycopg2.connect``
    :type dsn: string
    :param sql: SQL statement to execute
    :type sql: string
    :return: result of ``cursor.fetchall()``
    :rtype: list
    """
    conn = psycopg2.connect(dsn)
    try:
        # Autocommit: each statement is committed as it runs, so no explicit
        # commit is needed afterwards.
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute(sql)
            result = cur.fetchall()
    finally:
        # psycopg2's ``with connection`` block only ends the transaction; it
        # does not close the connection, so close it explicitly to avoid
        # leaking one connection per call.
        conn.close()
    return result


def get_start_end_date_list(granularity, date):
    """
    Return ``(start, end)`` date strings for the period that ends on ``date``.

    This function is defined again further down in the same cell; the later
    definition is the one in effect once the cell has run.

    :param granularity: one of 'weekly', 'monthly', 'quarterly', 'yearly'
    :type granularity: string
    :param date: period end date formatted as YYYY-MM-DD
    :type date: string
    :return: (start, end), where end is ``date`` itself
    :rtype: tuple
    """
    day_format = '%Y-%m-%d'
    if granularity == 'weekly':
        end_day = datetime.datetime.strptime(date, day_format)
        # One week back plus one day: the window is seven days inclusive.
        week_start = end_day - relativedelta(weeks=1) + relativedelta(days=1)
        start = week_start.strftime(day_format)
    elif granularity == 'monthly':
        start = date[:7] + '-01'
    elif granularity == 'quarterly':
        two_months_back = datetime.datetime.strptime(date, day_format) - relativedelta(months=2)
        start = two_months_back.strftime('%Y-%m') + '-01'
    elif granularity == 'yearly':
        start = date[:4] + '-01-01'
    # Any other granularity leaves ``start`` unassigned, so the return raises.
    return start, date
    
    
    
def get_date_list(start_date, end_date, freq="D"):
    """
    Return the dates between ``start_date`` and ``end_date`` as YYYY-MM-DD strings.

    :param start_date: first date of the range (anything ``pandas.date_range`` accepts)
    :param end_date: last date of the range, inclusive
    :param freq: pandas frequency alias, e.g.
            D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency
    :return: formatted dates generated by ``pandas.date_range``
    :rtype: list
    """
    # Imported locally: no ``import pandas as pd`` appears in this cell's
    # import block, so the bare ``pd`` name would otherwise be undefined.
    import pandas as pd

    return [day.strftime('%Y-%m-%d')
            for day in pd.date_range(start=start_date, end=end_date, freq=freq)]


def get_start_end_date_list(granularity, date):
    """
    Return ``(start, end)`` date strings for the period that ends on ``date``.

    :param granularity: one of 'weekly', 'monthly', 'quarterly', 'yearly'
    :type granularity: string
    :param date: period end date formatted as YYYY-MM-DD
    :type date: string
    :return: (start, end), where end is ``date`` itself
    :rtype: tuple
    :raises ValueError: if ``granularity`` is not one of the supported values
    """
    day_format = '%Y-%m-%d'
    if granularity == 'weekly':
        # Seven days inclusive: the window starts six days before the end date.
        week_start = datetime.datetime.strptime(date, day_format) - datetime.timedelta(days=6)
        start = week_start.strftime(day_format)
    elif granularity == 'monthly':
        start = date[:7] + '-01'
    elif granularity == 'quarterly':
        # Step back two whole months from the first day of the end month.
        month_start = datetime.datetime.strptime(date, day_format).replace(day=1)
        for _ in range(2):
            month_start = (month_start - datetime.timedelta(days=1)).replace(day=1)
        start = month_start.strftime(day_format)
    elif granularity == 'yearly':
        start = date[:4] + '-01-01'
    else:
        # Previously this fell through to the return with ``start`` unbound.
        raise ValueError("Unsupported granularity: {!r}".format(granularity))
    return start, date


def get_check_date_granularity(date):
    """
    List the aggregation granularities whose period ends on ``date``.

    :param date: date formatted as YYYY-MM-DD
    :type date: string
    :return: subset of ['weekly', 'monthly', 'quarterly', 'yearly'], in that order
    :rtype: list
    """
    parsed = datetime.datetime.strptime(date, '%Y-%m-%d')
    month_day = date[-5:]
    checks = (
        # isoweekday() == 6 is Saturday.
        ("weekly", parsed.isoweekday() == 6),
        ("monthly", last_day_of_month(parsed) == date),
        ("quarterly", month_day in ('03-31', '06-30', '09-30', '12-31')),
        ("yearly", month_day in ('12-31',)),
    )
    return [name for name, period_ends_here in checks if period_ends_here]


def last_day_of_month(check_month):
    """
    Return the last day of ``check_month``'s month as a YYYY-MM-DD string.

    :param check_month: any day inside the month of interest
    :type check_month: datetime.date or datetime.datetime
    :rtype: string
    """
    one_day = datetime.timedelta(days=1)
    # Day 28 exists in every month, and 28 + 4 days always lands in the next month.
    in_next_month = check_month.replace(day=28) + 4 * one_day
    month_end = in_next_month.replace(day=1) - one_day
    return month_end.strftime("%Y-%m-%d")



class TestStoreDownloadAttributionPreAgg(PipelineTest):
    """
    Timeliness check: every day of the weekly window must have at least one row
    with a non-zero ``est_paid_app_download`` in ``store.store_est_fact_v2``.
    """
    trigger_date_config = ("0 8 * * 4", 5)
    sql_paid_download_count = '''SELECT distinct date FROM store.store_est_fact_v2 WHERE  date between '{}' and
                      '{}' and est_paid_app_download != 0 order by date desc'''

    def test_download_attribution_db_pre_agg_timeleness(self):
        """Assert that no day is missing from the weekly window ending on the check date."""
        print(self.check_date)
        self.check_date = self._get_check_date_from_routing_config(self.trigger_datetime)
        # NOTE(review): isoweekday() == 7 is Sunday, while get_check_date_granularity
        # treats Saturday (6) as the end of a week -- confirm which day is intended.
        if self.check_date.isoweekday() != 7:
            return

        start, end = get_start_end_date_list("weekly", self.check_date_str)
        print('{} {}'.format(start, end))
        db_est_count = get_store_db_data(self.sql_paid_download_count.format(start, end))
        date_list = [d[0].strftime('%Y-%m-%d') for d in db_est_count]

        # The window is inclusive on both ends, so the expected number of
        # distinct dates is derived from it rather than hard-coded. The old
        # literal 8 could never match a seven-day window.
        day_format = '%Y-%m-%d'
        window = datetime.datetime.strptime(end, day_format) - datetime.datetime.strptime(start, day_format)
        expected_days = window.days + 1
        self.assertEqual(len(date_list), expected_days,
                         "Miss days in download attr {}".format(' ,'.join(map(str, date_list))))


class UnifedDownloadAttribution(object):
    """
    Reader for the unified ``store.download-attribution.v4`` fact data on S3.
    """
    _unified_s3_path = (
        "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.download-attribution.v4/"
        "fact/"
    )

    def __init__(self, spark):
        # Spark session used for every read issued by this object.
        self.spark = spark

    def get(self, start, end):
        """
        Load the delta table and keep the daily rows dated between ``start`` and ``end``.

        :param start: first date of the range (inclusive, via SQL ``between``)
        :type start: string
        :param end: last date of the range (inclusive, via SQL ``between``)
        :type end: string
        :return: unified_df
        :rtype: DataFrame
        """
        date_filter = "granularity='daily' and  date between '{}' and '{}'  ".format(start, end)
        delta_table = self.spark.read.format("delta").load(self._unified_s3_path)
        return delta_table.where(date_filter)


class TestStoreDownloadAttributionUnified(PipelineTest):
    """
    Checks that the unified download-attribution data holds seven distinct
    dates for the weekly window ending on the check date.
    """
    trigger_date_config = ("0 8 * * 4", 9)

    def test_download_attribution_(self):
        """Run the seven-day completeness check when the check date is a Saturday."""
        print(self.check_date)
        self.check_date = self._get_check_date_from_routing_config(self.trigger_datetime)
        # isoweekday() == 6 is Saturday; on any other day nothing is checked.
        if self.check_date.isoweekday() != 6:
            return

        start, end = get_start_end_date_list("weekly", self.check_date_str)
        weekly_df = UnifedDownloadAttribution(self.spark).get(start, end)
        weekly_df.createOrReplaceTempView("dwonlaod_attribution_share")
        distinct_rows = self.spark.sql("select distinct date from dwonlaod_attribution_share").collect()
        date_list = [row[0].strftime('%Y-%m-%d') for row in distinct_rows]
        found_days = ' ,'.join(map(str, date_list))
        self.assertEqual(len(date_list), 7, "Miss days in download attr {}".format(found_days))








class TestStoreDownloadRevenuePreAgg(PipelineTest):
    """
    Compares sums over the daily fact table with the matching row of the
    pre-aggregated table for every granularity whose period ends on the
    check date.
    """
    trigger_date_config = ("0 11 * * 4", 5)

    # Both queries return four columns in the same order:
    # app downloads (free + paid), revenue, organic downloads, paid downloads.
    sql_download_att_daily = "SELECT Sum(est_free_app_download) + Sum(est_paid_app_download) " \
                             "AS est_app_download, Sum(est_revenue) AS est_revenue, " \
                             "Sum(est_organic_download) AS est_organic_download, " \
                             "Sum(est_paid_download) AS est_paid_download " \
                             "FROM store.store_est_fact_v2 WHERE date between '{}' and '{}' "

    sql_pre_agg_est = "SELECT Sum(est_free_app_download) + Sum(est_paid_app_download), Sum(est_revenue), " \
                      "sum(est_organic_download), sum(est_paid_download) FROM store.store_est_t_{}_fact_v2 " \
                      "WHERE date BETWEEN '{}' AND '{}' AND granularity='{}'"

    failed_ids = defaultdict(list)

    def test_store_download_attr_pre_agg(self):
        """Assert that each pre-aggregated metric equals the sum of the daily rows."""
        # Labels for the four selected columns, in SELECT order.
        est_metrics = ('est_app_download', 'est_revenue', 'est_organic_download', 'est_paid_download')

        for granularity in get_check_date_granularity(self.check_date_str):
            start_date, end_date = get_start_end_date_list(granularity, self.check_date_str)
            print('{} {} {}'.format(granularity, start_date, end_date))

            est_daily_agg_result = get_store_db_data(
                self.sql_download_att_daily.format(start_date, end_date))[0]
            # The pre-aggregated table name uses the first letter of the granularity.
            est_pre_agg_result = get_store_db_data(
                self.sql_pre_agg_est.format(granularity[0], start_date, end_date, granularity))[0]

            print('est_daily_agg_result {}'.format(est_daily_agg_result))
            print('est_pre_agg_result {}'.format(est_pre_agg_result))

            # Compare every metric. The previous comprehension had its clauses
            # in the wrong order and only looked at the last metric.
            compared = [(metric, (int(daily_est), int(pre_agg_est)))
                        for metric, daily_est, pre_agg_est in
                        zip(est_metrics, est_daily_agg_result, est_pre_agg_result)]
            failed_ids = [(metric, values) for metric, values in compared if values[0] != values[1]]
            self.assertEqual([], failed_ids,
                             "{} pre-agg mismatch (daily, pre_agg): {}".format(granularity, failed_ids))


# def get_store_db_data(sql):
#     result = query(citus_settings("metadb"), sql)
#     return result


class TestStoreMetaDB(PipelineTest):
    """
    Checks that the meta table reports the check date as ``latest_available_date``
    for the active ``store_est_t_<g>_fact`` table of each granularity that ends
    on the check date.
    """
    trigger_date_config = ("13 * * * *", 6)

    sql_meta_db = "select table_name, app_stats -> 'latest_available_date' from meta where table_name like '%store_est_t_{}_fact%' " \
                  "and is_active=True limit 2;"

    failed_ids = defaultdict(list)

    def test_store_meta_db_accuracy(self):
        """Assert that the meta row of each due granularity carries the period end date."""
        granularities = get_check_date_granularity(self.check_date_str)
        print('{} {}'.format('meta db ', granularities))

        for granularity in granularities:
            end_date = get_start_end_date_list(granularity, self.check_date_str)[1]
            print('{} {}'.format(granularity, end_date))

            # Table names are matched by the first letter of the granularity.
            meta_rows = get_store_db_data(self.sql_meta_db.format(granularity[0]))
            # Fail with a readable message instead of an IndexError when the
            # query matches no active table.
            self.assertTrue(meta_rows,
                            "no active meta row found for granularity {}".format(granularity))

            est_daily_agg_result = meta_rows[0]
            print(est_daily_agg_result)
            self.assertEqual(end_date, est_daily_agg_result[1],
                             "meta db is not updated!, it should be {}, but current value is {}".format(
                                 end_date, est_daily_agg_result[1]))
            print('pass')

class TestStoreDownloadAttributionDB(PipelineTest):
    """
    Placeholder for the download-attribution DB timeliness check; the actual
    query and assertion are currently commented out.
    """
    trigger_date_config = ("0 8 * * 4", 5)
    # sql_paid_download_count = '''SELECT distinct date FROM store.store_est_fact_v2 WHERE  date between '{}' and
    #                   '{}' and est_paid_app_download != 0 order by date desc  '''

    def test_download_attribution_db_timeleness(self):
        """Print the check date and a reminder when it falls on a Saturday; asserts nothing."""
        print(self.check_date)
        self.check_date = self._get_check_date_from_routing_config(self.trigger_datetime)
        falls_on_saturday = self.check_date.isoweekday() == 6
        if falls_on_saturday:
            print('need to check startuday')
        #     start, end = get_start_end_date_list("weekly", self.check_date_str)
        #     print start, end
        #     db_est_count = get_store_db_data(self.sql_paid_download_count.format(start, end))
        #     date_list = [d[0].strftime('%Y-%m-%d') for d in db_est_count]
        #     self.assertEqual(len(date_list), 8, "Miss days in download attr {}".format(' ,'.join(map(str, date_list))))

# Cell trigger: invoke send_message(), the report helper defined in another cell of this notebook.
send_message()



In [0]:

class TestStoreMetaDB(PipelineTest):
    """Minimal TestStoreMetaDB variant whose single test only prints the check date."""
    trigger_date_config = ("13 * * * *", 122)

    def test_store_meta_db(self):
        """Print ``self.check_date``; no assertion is made."""
        print(self.check_date)
# Cell trigger: invoke send_message(), the report helper defined in another cell of this notebook.
send_message()


In [0]:
%%sh

# Show the report file that send_message() writes to /tmp/db_check.log.
cat /tmp/db_check.log


In [0]:

from applications.db_check_v1.common.html_report_test_runner import HTMLTestRunner

def send_message():
    """
    Run the TestStoreMetaDB suite through HTMLTestRunner, writing the report to
    /tmp/db_check.log, then print that file.

    The e-mail call is commented out, so the computed title is currently unused.
    """
    log_file = "/tmp/db_check.log"

    with open(log_file, "w") as html_file:
        suite = unittest.TestSuite()
        suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestStoreMetaDB))
        runner = HTMLTestRunner(
            stream=html_file,
            title='DB Test Report',
            description='This db_check the report output by Tech Team.'
        )
        # Entries whose first element is 1 or 2 count as failed
        # (presumably failure/error codes of HTMLTestRunner -- TODO confirm).
        outcomes = runner.run(suite).result
        failed_count = sum(1 for outcome in outcomes if outcome[0] in (1, 2))

    with open(log_file, 'r') as html_file:
        str_today = datetime.date.today().strftime("%Y-%m-%d")
        verdict = "Passed" if failed_count == 0 else "Failed"
        title = "Data Refresh Multiple Check Report - " + str_today + " - " + verdict
        print(html_file.read())
        # send_db_check_email(title, html_file.read())
        



In [0]:

# Scratch data: seven Row objects, one for each day from 2020-07-05 to
# 2020-07-11, in no particular order.
l = [
    Row(date=datetime.date(2020, 7, 11)),
    Row(date=datetime.date(2020, 7, 6)),
    Row(date=datetime.date(2020, 7, 5)),
    Row(date=datetime.date(2020, 7, 9)),
    Row(date=datetime.date(2020, 7, 10)),
    Row(date=datetime.date(2020, 7, 7)),
    Row(date=datetime.date(2020, 7, 8)),
]

print(len(l))

In [0]:
# Copyright (c) 2019 App Annie Inc. All rights reserved.

"""
Test DNA tag/hq/unified_app tables
"""

import datetime
from collections import defaultdict

from applications.db_check_v1.common.constants import query, citus_settings

from applications.db_check_v1.common.base_test import PipelineTest


class DnaHQUnifiedData(object):
    """
    Reader for the unified ``dna.company-hq-mapping.v1`` dimension on S3.
    """
    _unified_s3_path = (
        "s3://b2c-prod-data-pipeline-unified-dna/unified/dna.company-hq-mapping.v1/"
        "dimension/_update_date={}/_update_hour={}"
    )

    def __init__(self, spark):
        # Spark session used for every read issued by this object.
        self.spark = spark

    def get(self, date, hour):
        """
        Read the parquet data of a single update partition.

        :param date: value of the ``_update_date`` path segment
        :type date: string
        :param hour: value of the ``_update_hour`` path segment
        :type hour: string
        :return: unified_df
        :rtype: DataFrame
        """
        partition_path = self._unified_s3_path.format(date, hour)
        return self.spark.read.parquet(partition_path)


class DnaTagUnifiedData(object):
    """
    Reader for the unified ``dna.app-tag-mapping.v1`` dimension on S3.
    """
    _unified_s3_path = (
        "s3://b2c-prod-data-pipeline-unified-dna/unified/dna.app-tag-mapping.v1/"
        "dimension/_update_date={}/_update_hour={}"
    )

    def __init__(self, spark):
        # Spark session used for every read issued by this object.
        self.spark = spark

    def get(self, date, hour):
        """
        Read the parquet data of a single update partition.

        :param date: value of the ``_update_date`` path segment
        :type date: string
        :param hour: value of the ``_update_hour`` path segment
        :type hour: string
        :return: unified_df
        :rtype: DataFrame
        """
        partition_path = self._unified_s3_path.format(date, hour)
        return self.spark.read.parquet(partition_path)


class DnaUnifiedAppSingleAppData(object):
    """
    Reader for the unified ``dna.in-unified-product-product-mapping.v1``
    dimension on S3, restricted to rows with ``status=1``.
    """
    _unified_s3_path = (
        "s3://b2c-prod-data-pipeline-unified-dna/unified/"
        "dna.in-unified-product-product-mapping.v1/dimension/_update_date={}/_update_hour={}"
    )

    def __init__(self, spark):
        # Spark session used for every read issued by this object.
        self.spark = spark

    def get(self, date, hour):
        """
        Read the parquet data of a single update partition and keep ``status=1`` rows.

        :param date: value of the ``_update_date`` path segment
        :type date: string
        :param hour: value of the ``_update_hour`` path segment
        :type hour: string
        :return: unified_df
        :rtype: DataFrame
        """
        partition_path = self._unified_s3_path.format(date, hour)
        partition_df = self.spark.read.parquet(partition_path)
        return partition_df.where("status=1")


def get_dna_db_data(sql):
    """
    Execute ``sql`` through ``query`` with the settings returned by ``citus_settings("store")``.

    :param sql: SQL statement to execute
    :type sql: string
    :return: the rows returned by ``query``
    """
    store_settings = citus_settings("store")
    return query(store_settings, sql)


class TestDNAHQ(PipelineTest):
    """
    Compares the row count of the unified DNA company-HQ data with the row
    count of ``dna.dna_company_hq_dim_v1``.
    """
    trigger_date_config = ("0 2,14 * * *", 0)

    sql_hq = "select count(1) from dna.dna_company_hq_dim_v1"

    def setUp(self):
        super(TestDNAHQ, self).setUp()

    def test_dna_hq_completeness(self):
        """Assert that the unified data exists and has as many rows as the DB table."""
        self.check_date = self._get_check_date_from_routing_config(self.trigger_datetime)
        print(type(self.check_date))
        update_day = self.check_date_str
        # "%-H" yields the hour without zero padding (platform-specific directive).
        update_hour = self.check_date.strftime("%-H")

        unified_hq_count = DnaHQUnifiedData(self.spark).get(update_day, update_hour).count()
        self.assertNotEqual(unified_hq_count, 0, "DNA HQ unified data is not generated!")

        db_rows = get_dna_db_data(self.sql_hq)
        db_hq_count = db_rows[0][0]
        print('{} {}'.format(unified_hq_count, db_hq_count))

        mismatch_message = "DNA HQ unified data is not equals to db data! unified is {}, db is {}".format(
            unified_hq_count, db_hq_count)
        self.assertEqual(unified_hq_count, db_hq_count, mismatch_message)


class TestDNATAG(PipelineTest):
    """
    Compares the genre and modifier row counts of the unified DNA app-tag data
    with the counts in ``dna.dna_app_tag_dim_v1``.
    """
    trigger_date_config = ("0 1,7,13,19 * * *", 0)
    sql_tag = "select tag_type_code, count(1) from dna.dna_app_tag_dim_v1 " \
              "where tag_type_code in ('genre', 'modifier') group by tag_type_code"

    def setUp(self):
        super(TestDNATAG, self).setUp()

    def test_dna_tag(self):
        """Assert that unified genre/modifier counts are non-zero and match the DB."""
        self.check_date = self._get_check_date_from_routing_config(self.trigger_datetime)
        print(type(self.check_date))
        day = self.check_date_str
        hour = self.check_date.strftime("%-H")
        unified_df = DnaTagUnifiedData(self.spark).get(day, hour)
        unified_tag_genre_count = unified_df.where("tag_type_code = 'genre'").count()
        unified_tag_modifier_count = unified_df.where("tag_type_code = 'modifier'").count()

        self.assertNotEqual(unified_tag_genre_count, 0, "DNA tag genre unified data is not generated!")
        self.assertNotEqual(unified_tag_modifier_count, 0, "DNA tag modifier unified data is not generated!")

        # Query once and look the counts up by tag code. The GROUP BY has no
        # ORDER BY, so the position of the genre/modifier rows is not fixed.
        db_rows = get_dna_db_data(self.sql_tag)
        print(db_rows)
        db_counts = dict((tag_type_code, tag_count) for tag_type_code, tag_count in db_rows)
        # A tag type with no rows is absent from the grouped result: count 0.
        db_tag_genre_count = db_counts.get('genre', 0)
        db_tag_modifier_count = db_counts.get('modifier', 0)

        self.assertEqual(unified_tag_genre_count, db_tag_genre_count,
                         "DNA Tag unified data is not equals to db genre data! unified is {}, db is {}".format(
                             unified_tag_genre_count, db_tag_genre_count))
        self.assertEqual(unified_tag_modifier_count, db_tag_modifier_count,
                         "DNA Tag unified data is not equals to db modifier data! unified is {}, db is {}".format(
                             unified_tag_modifier_count, db_tag_modifier_count))


class TestDNAUnifiedSingleAPP(PipelineTest):
    """
    Compares the row count of the unified single-app data (``status=1`` rows)
    with the row count of ``dna.dna_app_unified_dim_v1``.
    """
    trigger_date_config = ("0 1 * * *", 0)
    sql_unified_single_app = "select count(1) from dna.dna_app_unified_dim_v1"

    def test_dna_unified_app_single_app(self):
        """Assert that the unified data exists and has as many rows as the DB table."""
        # Class attributes are not visible as bare names inside a method, so
        # the config has to be read through ``self``.
        check_date_str, check_hour_str = _get_date_from_refresh_routing_config(
            self.trigger_date_config, "hour")
        # NOTE(review): the next two lines use a hard-coded trigger time and
        # discard the result; they look like debugging leftovers -- confirm
        # whether they can be removed.
        trigger_datetime = datetime.datetime.strptime("2020-07-14 17:00:00", '%Y-%m-%d %H:%M:%S')
        self._get_check_date_from_routing_config(trigger_datetime).strftime("%Y-%m-%d")
        unified_df = DnaUnifiedAppSingleAppData(self.spark).get(check_date_str, check_hour_str)
        unified_app_count = unified_df.count()
        self.assertNotEqual(unified_app_count, 0, "unified data is not generated!")
        db_unified_app_count = get_dna_db_data(self.sql_unified_single_app)[0][0]
        self.assertEqual(unified_app_count, db_unified_app_count,
                         "Unified/single app data is not equals to db data! unified is {}, db is {}".format(
                             unified_app_count, db_unified_app_count))
