In [0]:

from pyspark.sql import Row
import random
from pyspark.sql.functions import lit, coalesce
from pyspark.sql.types import DoubleType
import datetime
from dateutil.relativedelta import relativedelta


# Module-level list for collected test results; starts out empty.
test_result = []
# Maps each phone/tablet device code to its aggregate "-all" device code.
device_code_agg_mapping = {
    'android-phone': 'android-all',
    'android-tablet': 'android-all',
    'ios-phone': 'ios-all',
    'ios-tablet': 'ios-all',
}


def last_day_of_month(check_month):
    """Return ``check_month`` moved to the last day of its month.

    Day 28 exists in every month and 28 + 4 days always falls in the
    following month, so stepping back one day from the first of that
    following month yields the final day of the original month.
    """
    into_next_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    first_of_next_month = into_next_month.replace(day=1)
    return first_of_next_month - datetime.timedelta(days=1)


def get_monthly_date_list():
    """Return one ``Row`` per month-end date from 2013-01-31 through 2019-10-31.

    :return: list of single-field ``Row`` objects, each holding a
        ``'YYYY-MM-DD'`` string.
    """
    result = []
    end = datetime.date(2019, 10, 31)
    # Written as ``1`` rather than ``01``: a leading-zero integer literal is a
    # syntax error on Python 3.
    start = datetime.date(2013, 1, 31)
    while start <= end:
        # Adding a month can land before the end of the month (Feb 28 -> Mar 28),
        # so snap to the last day before recording the date.
        start = last_day_of_month(start)
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(months=1)
    return result


def get_weekly_date_list():
    """Return one ``Row`` per week, starting 2013-01-12 and not going past 2019-10-31.

    :return: list of single-field ``Row`` objects, each holding a
        ``'YYYY-MM-DD'`` string, spaced seven days apart.
    """
    result = []
    end = datetime.date(2019, 10, 31)
    # Written as ``1`` rather than ``01``: a leading-zero integer literal is a
    # syntax error on Python 3.
    start = datetime.date(2013, 1, 12)
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(weeks=1)
    return result


def get_daily_date_list():
    """Return one ``Row`` per calendar day from 2015-12-27 through 2019-10-31.

    Each ``Row`` wraps a single ``'YYYY-MM-DD'`` string.
    """
    first_day = datetime.date(2015, 12, 27)
    final_day = datetime.date(2019, 10, 31)
    rows = []
    current = first_day
    while not current > final_day:
        rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + relativedelta(days=1)
    return rows


def get_path_date_list(granularity):
    """Build the date strings for ``granularity`` and group them by month.

    :param granularity: ``'daily'``, ``'weekly'`` or ``'monthly'``.
    :return: list of ``(month, dates)`` tuples sorted by month, where ``month``
        is the ``'YYYY-MM'`` prefix shared by every ``'YYYY-MM-DD'`` string in
        ``dates``.
    :raises ValueError: if ``granularity`` is not one of the supported values.
    """
    if granularity == 'daily':
        collect_date = get_daily_date_list()
    elif granularity == 'weekly':
        collect_date = get_weekly_date_list()
    elif granularity == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        # Fail up front instead of hitting an unbound local further down.
        raise ValueError('unsupported granularity: {!r}'.format(granularity))

    dates_by_month = {}
    for row in collect_date:
        day = row[0]
        # The first 7 characters of 'YYYY-MM-DD' form the 'YYYY-MM' month key.
        dates_by_month.setdefault(day[:7], []).append(day)
    # Zero-padded 'YYYY-MM' keys sort chronologically as plain strings.
    return sorted(dates_by_month.items(), key=lambda item: item[0])


def check_usage_unified_v1_v3_accuracy(_granularity, date_list):
    """Compare the unified usage v1 partition with the newer dataset, one random date per month.

    For every ``(month, [dates])`` entry one date is sampled. The v1 parquet
    partition for that date is rescaled and extended with derived columns,
    rolled up to ``ios-all`` / ``android-all`` device codes, and unioned with
    the per-device rows. Rows of that result with no exact match in the newer
    partition are shown and counted, and a PASS/FAIL line is printed.

    :param _granularity: value substituted into the ``granularity=`` path segment.
    :param date_list: list of ``(month, [date strings])`` tuples, as returned by
        ``get_path_date_list``.
    """
    unified_v1_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/' \
                      'usage.basic-kpi.v1/fact/granularity={v1_granularity}/date={v1_date}/'
    # NOTE(review): the "v3" names refer to the usage.basic-kpi.v5 dataset -- confirm this is intended.
    unified_v3_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/' \
                      'usage.basic-kpi.v5/fact/granularity={v3_granularity}/date={v3_date}/'
    for month in date_list:
        # month is (month_key, [dates]); only one randomly chosen date is checked.
        date = random.choice(month[1])
        unified_v1_path_parse = unified_v1_path.format(v1_granularity=_granularity, v1_date=date)
        unified_v3_path_parse = unified_v3_path.format(v3_granularity=_granularity, v3_date=date)

        unified_v1 = spark.read.parquet(unified_v1_path_parse)
        # Fill missing time-per-user from duration * sessions, and scale the session duration by 1000.
        unified_v1 = (
            unified_v1
                .withColumn('est_average_time_per_user', coalesce(unified_v1['est_average_time_per_user'],
                                                                  unified_v1['est_average_session_duration'] * unified_v1['est_average_session_per_user']))
                .withColumn('est_average_session_duration', unified_v1['est_average_session_duration'] * 1000)
        )
        # Scale the byte columns by 1024 * 1024 and add the derived columns.
        unified_v1 = (
            unified_v1
                .withColumn('est_average_bytes_per_session', unified_v1['est_average_bytes_per_session'] * 1024 * 1024)
                .withColumn('est_average_bytes_per_user', unified_v1['est_average_bytes_per_user'] * 1024 * 1024)
                .withColumn('est_share_of_category_bytes', unified_v1['est_share_of_category_bytes'] * 1024 * 1024)
                .withColumn('est_install_base',
                            unified_v1['est_install_penetration'] *
                            unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
                .withColumn('est_population', unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
                .withColumn('est_share_of_device_mb', lit(None).cast(DoubleType()))
                .withColumn('est_share_of_device_session', lit(None).cast(DoubleType()))
                .withColumn('est_share_of_device_time', lit(None).cast(DoubleType()))
                .withColumnRenamed('est_average_active_users_country_share', 'est_share_of_users')
                .withColumnRenamed('est_installs_country_share', 'est_share_of_installs')
                .withColumn('est_total_sessions',
                            unified_v1['est_average_active_users'] * unified_v1['est_average_session_per_user'])
                .withColumn('est_total_time',
                            unified_v1['est_average_active_users'] * unified_v1['est_average_time_per_user'] / 60)
        )
        unified_v1 = unified_v1.na.fill(0)
        unified_v1.createOrReplaceTempView("df")
        # Roll phone/tablet rows up to one "-all" row per app and country,
        # weighting averages by active users or population.
        agg_df = spark.sql("""
                      select
                       app_id, country_code,
                       case 
                           when device_code in ('ios-phone', 'ios-tablet') then 'ios-all' 
                           else 'android-all' 
                       end as device_code,
                       SUM(est_average_active_days * est_average_active_users) / SUM(est_average_active_users) as est_average_active_days,
                       SUM(est_average_session_duration * est_average_active_users) / SUM(est_average_active_users) AS est_average_session_duration, 
                       SUM(est_average_session_per_user * est_average_active_users) / SUM(est_average_active_users) AS est_average_session_per_user,
                       SUM(est_average_time_per_user * est_average_active_users) / SUM(est_average_active_users) AS est_average_time_per_user,
                       SUM(est_average_bytes_per_session * est_average_active_users) / SUM(est_average_active_users) AS est_average_bytes_per_session,
                       SUM(est_average_bytes_per_user * est_average_active_users) / SUM(est_average_active_users) AS est_average_bytes_per_user,
                       SUM(est_mb_per_second * est_population) / SUM(est_population) AS est_mb_per_second,
                       SUM(est_percent_of_wifi_total * est_population) / SUM(est_population) AS est_percent_of_wifi_total, 
                       SUM(est_percentage_active_days * est_average_active_users) / SUM(est_average_active_users) AS est_percentage_active_days,
                       SUM(est_average_active_users) AS est_average_active_users, 
                       SUM(est_install_penetration * est_population) / SUM(est_population) AS est_install_penetration,
                       SUM(est_installs*est_population) / SUM(est_population) AS est_installs,
                       SUM(est_usage_penetration * est_population) / SUM(est_population) AS est_usage_penetration,
                       SUM(est_install_base) AS est_install_base,
                       mean(est_population) as est_population,
                       SUM(est_total_time) AS est_total_time, 
                       SUM(est_share_of_category_time*est_population) / SUM(est_population) AS est_share_of_category_time,
                       SUM(est_share_of_device_time*est_population) / SUM(est_population) AS est_share_of_device_time, 
                       SUM(est_share_of_category_session*est_population) / SUM(est_population) AS est_share_of_category_session,
                       SUM(est_share_of_device_session*est_population) / SUM(est_population) AS est_share_of_device_session, 
                       SUM(est_share_of_category_bytes*est_population) / SUM(est_population) AS est_share_of_category_bytes,
                       SUM(est_share_of_device_mb*est_population)/SUM(est_population) AS est_share_of_device_mb,
                       SUM(est_panel_size*est_population) / SUM(est_population) AS est_panel_size
                       from df 
                       group by app_id, country_code, 
                       case 
                           when device_code in ('ios-phone', 'ios-tablet') then 'ios-all' 
                           else 'android-all' 
                       end
            """)
        agg_df.createOrReplaceTempView("agg_df")
        # The comma after country_code in the WW subquery matters: without it
        # country_code is aliased as est_install_base and the share-of-installs
        # divisor becomes the string 'WW'.
        agg_more_df = spark.sql("""
            select 
                agg_df.*,
                agg_df.est_usage_penetration/agg_df.est_install_penetration AS est_open_rate, 
                agg_df.est_average_active_users * agg_df.est_average_session_per_user AS est_total_sessions, 
                agg_df.est_install_base/ww.est_install_base as est_share_of_installs, 
                agg_df.est_average_active_users/ww.est_average_active_users as est_share_of_users
            from agg_df left join 
                (select device_code, app_id, country_code,
                 est_install_base, est_average_active_users 
                 from agg_df where country_code ='WW'
                 ) AS ww
            on (agg_df.app_id=ww.app_id) and (agg_df.device_code = ww.device_code)
            """)
        # Aggregated "-all" rows plus the original per-device rows, in the same column order.
        unified_v1_agg = agg_more_df.union(unified_v1.drop("_identifier").select(agg_more_df.columns))
        unified_v1_agg = unified_v1_agg.na.fill(0)
        unified_v1_agg.createOrReplaceTempView("unified_v1_agg")

        unified_v3 = spark.read.parquet(unified_v3_path_parse).drop("_identifier").distinct()
        unified_v3.createOrReplaceTempView("unified_v3")

        # Preview of the mismatching rows. EXCEPT pairs columns by position;
        # the count below aligns the columns by name first.
        subtract_df = spark.sql("""
            (select * from unified_v1_agg) EXCEPT (select * from unified_v3)
        """)
        subtract_df.show()
        subtract_count = unified_v1_agg.select(unified_v3.columns).subtract(unified_v3).count()
        if subtract_count != 0:
            print('Accuracy Test FAIL!!!! subtract count: {}, date: {}'.format(subtract_count, date))
        else:
            print('date: {} test PASS, subtract count: {}'.format(date, subtract_count))


# Run the accuracy check for every configured granularity.
granularity_list = ["daily"]
for granularity in granularity_list:
    check_usage_unified_v1_v3_accuracy(granularity, get_path_date_list(granularity))
# Completion marker only: it is printed even when individual dates reported FAIL.
# The parenthesised single-argument form prints the same text on Python 2 and 3.
print('pass')

In [0]:

# Load the stored daily accuracy-check results and preview them.
df = (
    spark.read
    .parquet("s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_unified_v1_v3_accuracy_0515/daily/")
)
df.show()

In [0]:

from pyspark.sql import Row
import random
from pyspark.sql.functions import lit, coalesce
from pyspark.sql.types import DoubleType
import datetime
from dateutil.relativedelta import relativedelta


# check_usage_unified_v1_v3_accuracy appends one (granularity, count, date)
# tuple here per sampled date.
test_result = []
# Maps each phone/tablet device code to its aggregate "-all" device code.
device_code_agg_mapping = {
    'android-phone': 'android-all',
    'android-tablet': 'android-all',
    'ios-phone': 'ios-all',
    'ios-tablet': 'ios-all',
}


def last_day_of_month(check_month):
    """Return ``check_month`` moved to the last day of its month.

    Day 28 exists in every month and 28 + 4 days always falls in the
    following month, so stepping back one day from the first of that
    following month yields the final day of the original month.
    """
    into_next_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    first_of_next_month = into_next_month.replace(day=1)
    return first_of_next_month - datetime.timedelta(days=1)


def get_monthly_date_list():
    """Return one ``Row`` per month-end date from 2013-01-31 through 2019-10-31.

    :return: list of single-field ``Row`` objects, each holding a
        ``'YYYY-MM-DD'`` string.
    """
    result = []
    end = datetime.date(2019, 10, 31)
    # Written as ``1`` rather than ``01``: a leading-zero integer literal is a
    # syntax error on Python 3.
    start = datetime.date(2013, 1, 31)
    while start <= end:
        # Adding a month can land before the end of the month (Feb 28 -> Mar 28),
        # so snap to the last day before recording the date.
        start = last_day_of_month(start)
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(months=1)
    return result


def get_weekly_date_list():
    """Return one ``Row`` per week, starting 2013-01-12 and not going past 2019-10-31.

    :return: list of single-field ``Row`` objects, each holding a
        ``'YYYY-MM-DD'`` string, spaced seven days apart.
    """
    result = []
    end = datetime.date(2019, 10, 31)
    # Written as ``1`` rather than ``01``: a leading-zero integer literal is a
    # syntax error on Python 3.
    start = datetime.date(2013, 1, 12)
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(weeks=1)
    return result


def get_daily_date_list():
    """Return one ``Row`` per calendar day from 2015-12-27 through 2019-10-31.

    Each ``Row`` wraps a single ``'YYYY-MM-DD'`` string.
    """
    first_day = datetime.date(2015, 12, 27)
    final_day = datetime.date(2019, 10, 31)
    rows = []
    current = first_day
    while not current > final_day:
        rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + relativedelta(days=1)
    return rows


def get_path_date_list(granularity):
    """Build the date strings for ``granularity`` and group them by month.

    :param granularity: ``'daily'``, ``'weekly'`` or ``'monthly'``.
    :return: list of ``(month, dates)`` tuples sorted by month, where ``month``
        is the ``'YYYY-MM'`` prefix shared by every ``'YYYY-MM-DD'`` string in
        ``dates``.
    :raises ValueError: if ``granularity`` is not one of the supported values.
    """
    if granularity == 'daily':
        collect_date = get_daily_date_list()
    elif granularity == 'weekly':
        collect_date = get_weekly_date_list()
    elif granularity == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        # Fail up front instead of hitting an unbound local further down.
        raise ValueError('unsupported granularity: {!r}'.format(granularity))

    dates_by_month = {}
    for row in collect_date:
        day = row[0]
        # The first 7 characters of 'YYYY-MM-DD' form the 'YYYY-MM' month key.
        dates_by_month.setdefault(day[:7], []).append(day)
    # Zero-padded 'YYYY-MM' keys sort chronologically as plain strings.
    return sorted(dates_by_month.items(), key=lambda item: item[0])


def check_not_empty(df):
    """Report whether ``est_average_active_users`` is populated on every row.

    Prints a pass/fail line and also returns the outcome so callers can act on it.

    :param df: object exposing ``filter(condition).count()`` (a Spark DataFrame
        in this notebook).
    :return: ``True`` when no row has a null ``est_average_active_users``,
        ``False`` otherwise.
    """
    empty_count = df.filter("est_average_active_users is null").count()
    passed = empty_count == 0
    # Parenthesised single-argument print emits the same text on Python 2 and 3.
    if passed:
        print("AU is Not Empty Check Pass!")
    else:
        print("Accuracy Test Fail!!! AU is Null!!!")
    return passed


def check_percentage_accuracy(df):
    """Report whether any ratio column holds a value greater than 1.

    Prints a pass/fail line and also returns the outcome so callers can act on it.

    :param df: object exposing ``filter(condition).count()`` (a Spark DataFrame
        in this notebook).
    :return: ``True`` when no row exceeds 1 in any checked column, ``False``
        otherwise.
    """
    ratio_columns = (
        'est_install_penetration',
        'est_open_rate',
        'est_percent_of_wifi_total',
        'est_percentage_active_days',
        'est_share_of_category_bytes',
        'est_share_of_category_session',
        'est_share_of_category_time',
        'est_share_of_device_mb',
        'est_share_of_device_session',
        'est_share_of_device_time',
        'est_share_of_installs',
        'est_share_of_users',
        'est_usage_penetration',
    )
    # Produces "col_a>1 or col_b>1 or ..." over every checked column.
    condition = " or ".join("{}>1".format(column) for column in ratio_columns)
    illegal_percentage_count = df.filter(condition).count()
    passed = illegal_percentage_count == 0
    # Parenthesised single-argument print emits the same text on Python 2 and 3.
    if passed:
        print("Percentage>1 Check Pass!")
    else:
        print("Accuracy Test Fail!!! Percentage > 1!!!")
    return passed


def check_usage_unified_v1_v3_accuracy(_granularity, date_list):
    """Compare the unified usage v1 partition with the newer dataset and persist the outcome.

    For every ``(month, [dates])`` entry one date is sampled. The v1 parquet
    partition for that date is rescaled and extended with derived columns,
    rolled up to ``ios-all`` / ``android-all`` device codes, and unioned with
    the per-device rows. The newer partition is sanity-checked, both
    directions of the row difference are counted, and a PASS/FAIL line is
    printed. The results of this call are appended to the module-level
    ``test_result`` list and written to the QA result table.

    :param _granularity: value substituted into the ``granularity=`` path
        segment and stored as the result ``type``.
    :param date_list: list of ``(month, [date strings])`` tuples, as returned by
        ``get_path_date_list``.
    """
    unified_v1_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/' \
                      'usage.basic-kpi.v1/fact/granularity={v1_granularity}/date={v1_date}/'
    # NOTE(review): the "v3" names refer to the usage.basic-kpi.v5 dataset -- confirm this is intended.
    unified_v3_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/' \
                      'usage.basic-kpi.v5/fact/granularity={v3_granularity}/date={v3_date}/'
    # Results of this call only. Writing the shared ``test_result`` list would
    # append rows from earlier calls to the result table a second time.
    run_results = []
    for month in date_list:
        # month is (month_key, [dates]); only one randomly chosen date is checked.
        date = random.choice(month[1])
        unified_v1_path_parse = unified_v1_path.format(v1_granularity=_granularity, v1_date=date)
        unified_v3_path_parse = unified_v3_path.format(v3_granularity=_granularity, v3_date=date)

        unified_v1 = spark.read.parquet(unified_v1_path_parse)
        # Fill missing time-per-user from duration * sessions, and scale the session duration by 1000.
        unified_v1 = (
            unified_v1
                .withColumn('est_average_time_per_user', coalesce(unified_v1['est_average_time_per_user'],
                                                                  unified_v1['est_average_session_duration'] * unified_v1['est_average_session_per_user']))
                .withColumn('est_average_session_duration', unified_v1['est_average_session_duration'] * 1000)
        )
        # Scale the byte columns by 1024 * 1024 and add the derived columns.
        unified_v1 = (
            unified_v1
                .withColumn('est_average_bytes_per_session', unified_v1['est_average_bytes_per_session'] * 1024 * 1024)
                .withColumn('est_average_bytes_per_user', unified_v1['est_average_bytes_per_user'] * 1024 * 1024)
                .withColumn('est_share_of_category_bytes', unified_v1['est_share_of_category_bytes'] * 1024 * 1024)
                .withColumn('est_install_base',
                            unified_v1['est_install_penetration'] *
                            unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
                .withColumn('est_population', unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
                .withColumn('est_share_of_device_mb', lit(None).cast(DoubleType()))
                .withColumn('est_share_of_device_session', lit(None).cast(DoubleType()))
                .withColumn('est_share_of_device_time', lit(None).cast(DoubleType()))
                .withColumnRenamed('est_average_active_users_country_share', 'est_share_of_users')
                .withColumnRenamed('est_installs_country_share', 'est_share_of_installs')
                .withColumn('est_total_sessions',
                            unified_v1['est_average_active_users'] * unified_v1['est_average_session_per_user'])
                .withColumn('est_total_time',
                            unified_v1['est_average_active_users'] * unified_v1['est_average_time_per_user'] / 60)
        )
        unified_v1 = unified_v1.na.fill(0)
        unified_v1.createOrReplaceTempView("df")
        # Roll phone/tablet rows up to one "-all" row per app and country,
        # weighting averages by active users or population.
        agg_df = spark.sql("""
                      select
                       app_id, country_code,
                       case 
                           when device_code in ('ios-phone', 'ios-tablet') then 'ios-all' 
                           else 'android-all' 
                       end as device_code,
                       sum(est_average_active_days * est_average_active_users) / sum(est_average_active_users) as est_average_active_days,
                       sum(est_average_session_duration * est_average_active_users) / sum(est_average_active_users) AS est_average_session_duration, 
                       SUM(est_average_session_per_user * est_average_active_users) / SUM(est_average_active_users) AS est_average_session_per_user,
                       SUM(est_average_time_per_user * est_average_active_users) / SUM(est_average_active_users) AS est_average_time_per_user,
                       SUM(est_average_bytes_per_session * est_average_active_users) / SUM(est_average_active_users) AS est_average_bytes_per_session,
                       SUM(est_average_bytes_per_user * est_average_active_users) / SUM(est_average_active_users) AS est_average_bytes_per_user,
                       SUM(est_mb_per_second * est_population) / SUM(est_population) AS est_mb_per_second,
                       SUM(est_percent_of_wifi_total * est_population) / SUM(est_population) AS est_percent_of_wifi_total, 
                       SUM(est_percentage_active_days * est_average_active_users) / SUM(est_average_active_users) AS est_percentage_active_days,
                       SUM(est_average_active_users) AS est_average_active_users, 
                       SUM(est_install_penetration * est_population) / SUM(est_population) AS est_install_penetration,
                       SUM(est_installs*est_population) / SUM(est_population) AS est_installs,
                       SUM(est_usage_penetration * est_population) / SUM(est_population) AS est_usage_penetration,
                       SUM(est_install_base) AS est_install_base,
                       mean(est_population) as est_population,
                       SUM(est_total_time) AS est_total_time, 
                       SUM(est_share_of_category_time*est_population) / SUM(est_population) AS est_share_of_category_time,
                       SUM(est_share_of_device_time*est_population) / SUM(est_population) AS est_share_of_device_time, 
                       SUM(est_share_of_category_session*est_population) / SUM(est_population) AS est_share_of_category_session,
                       SUM(est_share_of_device_session*est_population) / SUM(est_population) AS est_share_of_device_session, 
                       SUM(est_share_of_category_bytes*est_population) / SUM(est_population) AS est_share_of_category_bytes,
                       SUM(est_share_of_device_mb*est_population)/SUM(est_population) AS est_share_of_device_mb,
                       SUM(est_panel_size*est_population) / SUM(est_population) AS est_panel_size
                       from df 
                       group by app_id, country_code, 
                       case 
                           when device_code in ('ios-phone', 'ios-tablet') then 'ios-all' 
                           else 'android-all' 
                       end
            """)
        agg_df.createOrReplaceTempView("agg_df")
        # The comma after country_code in the WW subquery matters: without it
        # country_code is aliased as est_install_base and the share-of-installs
        # divisor becomes the string 'WW'.
        agg_more_df = spark.sql("""
            select 
                agg_df.*,
                agg_df.est_usage_penetration/agg_df.est_install_penetration AS est_open_rate, 
                agg_df.est_average_active_users * agg_df.est_average_session_per_user AS est_total_sessions, 
                agg_df.est_install_base/ww.est_install_base as est_share_of_installs, 
                agg_df.est_average_active_users/ww.est_average_active_users as est_share_of_users
            from agg_df left join 
                (select device_code, app_id, country_code,
                 est_install_base, est_average_active_users 
                 from agg_df where country_code ='WW'
                 ) AS ww
            on (agg_df.app_id=ww.app_id) and (agg_df.device_code = ww.device_code)
            """)
        # Aggregated "-all" rows plus the original per-device rows, in the same column order.
        unified_v1_agg = agg_more_df.union(unified_v1.drop("_identifier").select(agg_more_df.columns))
        unified_v1_agg = unified_v1_agg.na.fill(0)

        unified_v3 = spark.read.parquet(unified_v3_path_parse).distinct().drop('_identifier')

        check_percentage_accuracy(unified_v3)
        check_not_empty(unified_v3)

        # Count the difference in both directions so rows missing on either side are caught.
        subtract_count = unified_v1_agg.select(unified_v3.columns).subtract(unified_v3).count()
        subtract_count_reverse = unified_v3.select(unified_v1_agg.columns).subtract(unified_v1_agg).count()
        worst_count = max(subtract_count, subtract_count_reverse)
        if subtract_count != 0 or subtract_count_reverse != 0:
            print('Accuracy Test FAIL!!!! subtract count: {}, date: {}'.format(worst_count, date))
        else:
            print('Accuracy Test PASS! subtract count: {}, date: {}'.format(worst_count, date))
        result_row = (_granularity, worst_count, date)
        run_results.append(result_row)
        # Keep the shared list up to date for code that inspects it afterwards.
        test_result.append(result_row)

    if not run_results:
        # Nothing was checked, so there is nothing to persist.
        return
    df_write_result = spark.createDataFrame(run_results, schema=['type', 'subtract_count', 'date'])

    from aadatapipelinecore.core.utils.retry import retry

    def write_test_result(df_write_result):
        # NOTE(review): the target path ends in /daily/ for every granularity;
        # rows are told apart by the "type" partition -- confirm this is intended.
        df_write_result.write.format("delta").save(
            "s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_unified_v1_v3_accuracy_0515/daily/",
            mode="append",
            partitionBy=["type"])
    retry(write_test_result, (df_write_result,), {}, interval=10)


# Run the accuracy check for every configured granularity.
granularity_list = ["daily"]
for granularity in granularity_list:
    check_usage_unified_v1_v3_accuracy(granularity, get_path_date_list(granularity))
# Completion marker only: it is printed even when individual dates reported FAIL.
# The parenthesised single-argument form prints the same text on Python 2 and 3.
print('pass')

In [0]:

# Show est_open_rate / est_share_of_users for the rows of the 2015-12-29 daily
# partition where any of the filtered ratio columns exceeds 1.
df = spark.read.parquet(
    's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v5/fact/granularity=daily/date=2015-12-29/')
(
    df
    .select(
        'app_id', 'device_code', 'country_code',
        'est_install_penetration', 'est_open_rate', 'est_percent_of_wifi_total',
        'est_percentage_active_days', 'est_share_of_category_bytes',
        'est_share_of_category_session', 'est_share_of_category_time',
        'est_share_of_device_mb', 'est_share_of_device_session', 'est_share_of_device_time',
        'est_share_of_installs', 'est_share_of_users', 'est_usage_penetration',
    )
    .filter(
        "est_install_penetration>1 or est_open_rate>1 or est_percent_of_wifi_total>1 "
        "or est_percentage_active_days>1 or est_share_of_category_bytes>1 or est_share_of_category_session>1 "
        "or est_share_of_category_time>1 or est_share_of_device_mb>1 or est_share_of_device_session>1 "
        "or est_share_of_device_time>1 or est_share_of_installs>1 "
        "or est_usage_penetration>1")
    .select('est_open_rate', 'est_share_of_users')
    .show()
)


In [0]:

# Side-by-side look at one app (326251330) on 2015-12-29: the 'ios-all' rows of
# the v5 dataset, then the 'ios-phone' rows of the v1 dataset.
df = spark.read.parquet(
    's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v5/fact/granularity=daily/date=2015-12-29/')
(
    df
    .select('app_id', 'device_code', 'country_code', 'est_average_active_users', 'est_share_of_users')
    .filter("app_id=326251330 and device_code='ios-all'")
    .show()
)
df2 = spark.read.parquet(
    's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v1/fact/granularity=daily/date=2015-12-29/')
(
    df2
    .select('app_id', 'device_code', 'country_code', 'est_average_active_users',
            'est_average_active_users_country_share')
    .filter("app_id=326251330 and device_code='ios-phone'")
    .show()
)

In [0]:

from pyspark.sql import Row
import random
from pyspark.sql.functions import lit, coalesce
from pyspark.sql.types import DoubleType
import datetime
from dateutil.relativedelta import relativedelta

# Module-level list for collected test results; starts out empty.
test_result = []
# Maps each phone/tablet device code to its aggregate "-all" device code.
device_code_agg_mapping = {
    'android-phone': 'android-all',
    'android-tablet': 'android-all',
    'ios-phone': 'ios-all',
    'ios-tablet': 'ios-all',
}


def last_day_of_month(check_month):
    """Return ``check_month`` moved to the last day of its month.

    Day 28 exists in every month and 28 + 4 days always falls in the
    following month, so stepping back one day from the first of that
    following month yields the final day of the original month.
    """
    into_next_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    first_of_next_month = into_next_month.replace(day=1)
    return first_of_next_month - datetime.timedelta(days=1)


def get_monthly_date_list():
    """Return one ``Row`` per month-end date from 2020-01-31 through 2020-04-30.

    Each ``Row`` wraps a single ``'YYYY-MM-DD'`` string.
    """
    cursor = datetime.date(2020, 1, 31)
    limit = datetime.date(2020, 4, 30)
    rows = []
    while not cursor > limit:
        # Adding a month can land before the end of the month, so snap to the
        # month's last day before recording it.
        month_end = last_day_of_month(cursor)
        rows.append(Row(month_end.strftime('%Y-%m-%d')))
        cursor = month_end + relativedelta(months=1)
    return rows


def get_weekly_date_list():
    """Return one ``Row`` per week, starting 2020-01-04 and not going past 2020-05-23.

    Each ``Row`` wraps a single ``'YYYY-MM-DD'`` string; dates are seven days apart.
    """
    first_day = datetime.date(2020, 1, 4)
    final_day = datetime.date(2020, 5, 23)
    rows = []
    current = first_day
    while not current > final_day:
        rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + relativedelta(weeks=1)
    return rows


def get_daily_date_list():
    """Return one ``Row`` per calendar day from 2020-01-01 through 2020-05-23.

    Each ``Row`` wraps a single ``'YYYY-MM-DD'`` string.
    """
    first_day = datetime.date(2020, 1, 1)
    final_day = datetime.date(2020, 5, 23)
    rows = []
    current = first_day
    while not current > final_day:
        rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + relativedelta(days=1)
    return rows


def get_path_date_list(gran):
    """Build the date strings for granularity ``gran`` and group them by month.

    :param gran: ``'daily'``, ``'weekly'`` or ``'monthly'``.
    :return: list of ``(month, dates)`` tuples sorted by month, where ``month``
        is the ``'YYYY-MM'`` prefix shared by every ``'YYYY-MM-DD'`` string in
        ``dates``.
    :raises ValueError: if ``gran`` is not one of the supported values.
    """
    if gran == 'daily':
        collect_date = get_daily_date_list()
    elif gran == 'weekly':
        collect_date = get_weekly_date_list()
    elif gran == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        # Fail up front instead of hitting an unbound local further down.
        raise ValueError('unsupported granularity: {!r}'.format(gran))

    dates_by_month = {}
    for row in collect_date:
        day = row[0]
        # The first 7 characters of 'YYYY-MM-DD' form the 'YYYY-MM' month key.
        dates_by_month.setdefault(day[:7], []).append(day)
    # Zero-padded 'YYYY-MM' keys sort chronologically as plain strings.
    return sorted(dates_by_month.items(), key=lambda item: item[0])


def check_not_empty(df, date):
    """Report whether ``est_average_active_users`` is populated on every row.

    Prints a pass/fail line and also returns the outcome so callers can act on it.

    :param df: object exposing ``filter(condition).count()`` (a Spark DataFrame
        in this notebook).
    :param date: label included in the printed message.
    :return: ``True`` when no row has a null ``est_average_active_users``,
        ``False`` otherwise.
    """
    empty_count = df.filter("est_average_active_users is null").count()
    passed = empty_count == 0
    # Parenthesised single-argument print emits the same text on Python 2 and 3.
    if passed:
        print("AU is Not Empty Test Pass! date: {}".format(date))
    else:
        print("AU is Not Empty Test Fail!!! empty_count: {}, date: {}".format(empty_count, date))
    return passed


def check_percentage_accuracy(df, date):
    """Report whether any ratio column holds a value greater than 1.

    Prints a pass/fail line and also returns the outcome so callers can act on it.

    :param df: object exposing ``filter(condition).count()`` (a Spark DataFrame
        in this notebook).
    :param date: label included in the printed message.
    :return: ``True`` when no row exceeds 1 in any checked column, ``False``
        otherwise.
    """
    ratio_columns = (
        'est_install_penetration',
        'est_open_rate',
        'est_percent_of_wifi_total',
        'est_percentage_active_days',
        'est_share_of_category_bytes',
        'est_share_of_category_session',
        'est_share_of_category_time',
        'est_share_of_installs',
        'est_share_of_users',
        'est_usage_penetration',
    )
    # Produces "col_a>1 or col_b>1 or ..." over every checked column.
    condition = " or ".join("{}>1".format(column) for column in ratio_columns)
    illegal_percentage_count = df.filter(condition).count()
    passed = illegal_percentage_count == 0
    # Parenthesised single-argument print emits the same text on Python 2 and 3.
    if passed:
        print("Percentage<1 Test Pass! date: {}".format(date))
    else:
        print("Percentage<1 Test Fail!!! illegal_percentage_count: {}, date: {}".format(
            illegal_percentage_count, date))
    return passed


def check_usage_unified_v1_v3_accuracy(_granularity, date_list):
    """Compare two unified usage datasets on one randomly sampled date per month.

    The parquet partition is rescaled, extended with derived columns and
    worldwide ("WW") reference values, then compared in both directions with
    the Delta partition for the same date. A PASS/FAIL line is printed per date.

    :param _granularity: value substituted into the ``granularity=`` path segment.
    :param date_list: list of ``(month, [dates])`` tuples, e.g.
        ``[('2015-12', ['2015-12-27', '2015-12-28', '2015-12-29', '2015-12-30', '2015-12-31'])]``.
    """
    # The templates do not change between iterations, so build them once.
    # NOTE(review): the "v1" side reads app-tech.usage.basic-kpi.v3 and the
    # "v3" side reads usage.basic-kpi.v3 -- confirm the naming is intended.
    unified_v1_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/app-tech.usage.basic-kpi.v3/fact/' \
                      'granularity={v1_granularity}/date={v1_date}/'
    # A single partition path; the earlier template had the whole path pasted in twice.
    unified_v3_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v3/fact/' \
                      'granularity={v3_granularity}/date={v3_date}/'
    for month in date_list:
        # month is (month_key, [dates]); only one randomly chosen date is checked.
        date = random.choice(month[1])
        unified_v1_path_parse = unified_v1_path.format(v1_granularity=_granularity, v1_date=date)
        unified_v3_path_parse = unified_v3_path.format(v3_granularity=_granularity, v3_date=date)

        unified_v1 = spark.read.parquet(unified_v1_path_parse)
        # Column expressions below refer to the freshly read frame, so
        # est_total_time is computed from the raw est_average_time_per_user.
        unified_v1 = (
            unified_v1
            .drop("est_mb_per_second", "est_share_of_device_time", "est_share_of_device_session",
                  "est_share_of_device_mb", "est_panel_size")
            .withColumn('est_total_time',
                        unified_v1['est_average_active_users'] * unified_v1['est_average_time_per_user'] / 60)
            .withColumn('est_average_time_per_user', coalesce(unified_v1['est_average_time_per_user'],
                                                              unified_v1['est_average_session_duration'] *
                                                              unified_v1['est_average_session_per_user']) * 1000)
            .withColumn('est_average_session_duration', unified_v1['est_average_session_duration'] * 1000)
        )
        # Scale the byte columns by 1024 * 1024 and add the derived columns.
        unified_v1 = (
            unified_v1
            .withColumn('est_average_bytes_per_session', unified_v1['est_average_bytes_per_session'] * 1024 * 1024)
            .withColumn('est_average_bytes_per_user', unified_v1['est_average_bytes_per_user'] * 1024 * 1024)
            .withColumn('est_share_of_category_bytes', unified_v1['est_share_of_category_bytes'] * 1024 * 1024)
            .withColumn('est_install_base',
                        unified_v1['est_install_penetration'] *
                        unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
            .withColumn('est_population', unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
            .withColumnRenamed('est_average_active_users_country_share', 'est_share_of_users')
            .withColumnRenamed('est_installs_country_share', 'est_share_of_installs')
            .withColumn('est_total_sessions',
                        unified_v1['est_average_active_users'] * unified_v1['est_average_session_per_user'])
        )
        unified_v1.createOrReplaceTempView("v1_df")
        # Attach the worldwide ("WW") active users and installs of the same app and device to every row.
        unified_v1 = spark.sql("""
                    select
                        v1_df.*,
                        ww.est_average_active_users as est_average_active_users_worldwide,
                        ww.est_installs as est_installs_worldwide
                    from v1_df left join
                        (select device_code, app_id, est_installs, est_average_active_users
                         from v1_df where country_code ='WW'
                         ) AS ww
                    on (v1_df.app_id=ww.app_id) and (v1_df.device_code = ww.device_code)
                    """)
        unified_v1 = unified_v1.withColumnRenamed('app_id', 'product_id')
        unified_v1 = unified_v1.na.fill(0).drop('_identifier')

        unified_v3 = spark.read.format("delta").load(unified_v3_path_parse).drop('_identifier', 'date', 'granularity')

        check_percentage_accuracy(unified_v3, date)
        check_not_empty(unified_v3, date)

        # Count the difference in both directions so rows missing on either side are caught.
        subtract_count = unified_v1.select(unified_v3.columns).subtract(unified_v3).count()
        subtract_count_reverse = unified_v3.select(unified_v1.columns).subtract(unified_v1).count()
        worst_count = max(subtract_count, subtract_count_reverse)
        if subtract_count != 0 or subtract_count_reverse != 0:
            print('Accuracy Test FAIL!!!! subtract count: {}, date: {}'.format(worst_count, date))
        else:
            print('Accuracy Test PASS! subtract count: {}, date: {}'.format(worst_count, date))


# Run the accuracy check for every configured granularity.
granularity_list = ["daily"]
for granularity in granularity_list:
    check_usage_unified_v1_v3_accuracy(granularity, get_path_date_list(granularity))
# Completion marker only: it is printed even when individual dates reported FAIL.
# The parenthesised single-argument form prints the same text on Python 2 and 3.
print('pass')


In [0]:

from pyspark.sql import Row
import random
from pyspark.sql.functions import lit, coalesce
from pyspark.sql.types import DoubleType
import datetime
from dateutil.relativedelta import relativedelta

test_result = []
device_code_agg_mapping = {'android-phone': 'android-all', 'android-tablet': 'android-all',
                           'ios-phone': 'ios-all', 'ios-tablet': 'ios-all'}


def last_day_of_month(check_month):
    """Return the last calendar day of the month that contains ``check_month``."""
    # Day 28 exists in every month, and 28 + 4 days always falls in the next month.
    in_following_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    # Stepping back by that date's day-of-month lands on the previous month's last day.
    overshoot = datetime.timedelta(days=in_following_month.day)
    return in_following_month - overshoot


def get_monthly_date_list():
    """Build one single-field Row per month end between the hard-coded bounds.

    Returns:
        A list of ``Row`` objects, each wrapping the 'YYYY-MM-DD' string of a
        month's last day, starting with January 2020 and continuing while the
        stepped date is on or before 2020-04-30.
    """
    final_month_end = datetime.date(2020, 4, 30)
    one_month = relativedelta(months=1)

    month_rows = []
    cursor = datetime.date(2020, 1, 31)
    while cursor <= final_month_end:
        # The stepped date is not guaranteed to be a month end, so snap to it first.
        cursor = last_day_of_month(cursor)
        month_rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor = cursor + one_month
    return month_rows


def get_weekly_date_list():
    """Build one single-field Row per week between the hard-coded bounds.

    Returns:
        A list of ``Row`` objects, each wrapping a 'YYYY-MM-DD' string,
        starting at 2020-01-04 and stepping one week at a time while the
        date is on or before 2020-05-23.
    """
    last_day = datetime.date(2020, 5, 23)
    one_week = relativedelta(weeks=1)

    week_rows = []
    current = datetime.date(2020, 1, 4)
    while current <= last_day:
        week_rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + one_week
    return week_rows


def get_daily_date_list():
    """Build one single-field Row per day between the hard-coded bounds.

    Returns:
        A list of ``Row`` objects, each wrapping a 'YYYY-MM-DD' string,
        starting at 2020-01-01 and stepping one day at a time while the
        date is on or before 2020-05-23.
    """
    last_day = datetime.date(2020, 5, 23)
    one_day = relativedelta(days=1)

    day_rows = []
    current = datetime.date(2020, 1, 1)
    while current <= last_day:
        day_rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + one_day
    return day_rows


def get_path_date_list(gran):
    """Group the dates generated for ``gran`` by calendar month.

    Args:
        gran: One of 'daily', 'weekly' or 'monthly'.

    Returns:
        A list of ``(month, dates)`` tuples sorted chronologically, where
        ``month`` is a 'YYYY-MM' string and ``dates`` is the list of
        'YYYY-MM-DD' strings of that month in generation order.

    Raises:
        ValueError: If ``gran`` is not one of the supported granularities.
    """
    if gran == 'daily':
        collect_date = get_daily_date_list()
    elif gran == 'weekly':
        collect_date = get_weekly_date_list()
    elif gran == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        # Fail fast with a clear message rather than an UnboundLocalError below.
        raise ValueError("unsupported granularity: {!r}".format(gran))

    date_list = {}
    for x in collect_date:
        # x[0] is the 'YYYY-MM-DD' string; its first seven characters are 'YYYY-MM'.
        date_list.setdefault(x[0][:7], []).append(x[0])

    # Order the months chronologically by parsing 'YYYY-MM' + '-1' as a date.
    return sorted(date_list.items(),
                  key=lambda d: datetime.datetime.strptime(d[0] + '-1', '%Y-%m-%d'))


def check_not_empty(df, date):
    """Print whether any row of ``df`` has a null ``est_average_active_users``.

    Args:
        df: DataFrame exposing an ``est_average_active_users`` column.
        date: Date label included in the printed message.

    Nothing is returned or raised; the outcome is only printed.
    """
    null_rows = df.filter("est_average_active_users is null").count()
    if null_rows == 0:
        message = "AU is Not Empty Test Pass! date: {}".format(date)
    else:
        message = "AU is Not Empty Test Fail!!! empty_count: {}, date: {}".format(null_rows, date)
    print(message)


def check_percentage_accuracy(df, date):
    """Print whether any of the listed ratio columns of ``df`` exceeds 1.

    Args:
        df: DataFrame exposing the columns named in the filter expression.
        date: Date label included in the printed message.

    Nothing is returned or raised; the outcome is only printed.
    """
    over_one_condition = (
        "est_install_penetration>1 or est_open_rate>1 or est_percent_of_wifi_total>1 "
        "or est_percentage_active_days>1 or est_share_of_category_bytes>1 or est_share_of_category_session>1 "
        "or est_share_of_category_time>1 or est_share_of_installs>1 or est_share_of_users>1 "
        "or est_usage_penetration>1"
    )
    offending_rows = df.filter(over_one_condition).count()
    if offending_rows == 0:
        print("Percentage<1 Test Pass! date: {}".format(date))
    else:
        print("Percentage<1 Test Fail!!! illegal_percentage_count: {}, date: {}".format(offending_rows, date))


def check_usage_unified_v1_v3_accuracy(_granularity, date_list):
    """
    Compare the reshaped "v1" data with the unified v3 data for one random date per month.

    For every entry of date_list a single date is sampled, the "v1" parquet
    data is loaded and reshaped (derived columns added, values rescaled,
    columns renamed, worldwide figures joined in), the v3 delta data is
    loaded, and the rows that differ are counted in both directions.
    Results are only printed; nothing is returned or raised on a mismatch.

        _granularity:
                value substituted into the granularity= part of both paths
        date_list:
                [(month,[day1,day2,day3])]
        sample:
            [('2015-12', ['2015-12-27', '2015-12-28', '2015-12-29', '2015-12-30', '2015-12-31'])]
    """
    for month in date_list:
        # Note: the data called "v1" here is read from the app-tech.usage.basic-kpi.v3 location.
        unified_v1_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/app-tech.usage.basic-kpi.v3/fact/' \
                          'granularity={v1_granularity}/date={v1_date}/'
        unified_v3_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v3/fact/' \
                          'granularity={v3_granularity}/date={v3_date}/'
        # Pick one date of this month at random (randint is inclusive on both ends).
        sample_index = random.randint(0, len(month[1]) - 1)
        date = month[1][sample_index]
        unified_v1_path_parse = unified_v1_path.format(v1_granularity=_granularity, v1_date=date)
        unified_v3_path_parse = unified_v3_path.format(v3_granularity=_granularity, v3_date=date)

        unified_v1 = spark.read.parquet(unified_v1_path_parse)
        # Every unified_v1[...] below is taken from the DataFrame as it was read above,
        # i.e. before any of the rescaling in this statement.
        unified_v1 = (
            unified_v1
            .drop("est_mb_per_second", "est_share_of_device_time", "est_share_of_device_session",
                  "est_share_of_device_mb", "est_panel_size")
            .withColumn('est_total_time',
                        unified_v1['est_average_active_users'] * unified_v1['est_average_time_per_user'] / 60)
            # Falls back to session duration * sessions per user when time per user is null.
            # The * 1000 is presumably seconds -> milliseconds; confirm units.
            .withColumn('est_average_time_per_user', coalesce(unified_v1['est_average_time_per_user'],
                                                              unified_v1['est_average_session_duration'] * 
                                                              unified_v1['est_average_session_per_user']) * 1000)
            .withColumn('est_average_session_duration', unified_v1['est_average_session_duration'] * 1000)
        )
        # From here unified_v1[...] refers to the result of the statement above.
        unified_v1 = (
            unified_v1
            # The * 1024 * 1024 is presumably MB -> bytes; confirm units.
            .withColumn('est_average_bytes_per_session', unified_v1['est_average_bytes_per_session'] * 1024 * 1024)
            .withColumn('est_average_bytes_per_user', unified_v1['est_average_bytes_per_user'] * 1024 * 1024)
            .withColumn('est_share_of_category_bytes', unified_v1['est_share_of_category_bytes'] * 1024 * 1024)
            .withColumn('est_install_base',
                        unified_v1['est_install_penetration'] *
                        unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
            .withColumn('est_population', unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
            .withColumnRenamed('est_average_active_users_country_share', 'est_share_of_users')
            .withColumnRenamed('est_installs_country_share', 'est_share_of_installs')
            .withColumn('est_total_sessions',
                        unified_v1['est_average_active_users'] * unified_v1['est_average_session_per_user'])
        )
        # Attach the 'WW' row's active users and installs to every row that shares
        # its app_id and device_code (left join, so rows without a 'WW' match are kept).
        unified_v1.createOrReplaceTempView("v1_df")
        unified_v1 = spark.sql("""
                    select
                        v1_df.*,
                        ww.est_average_active_users as est_average_active_users_worldwide,
                        ww.est_installs as est_installs_worldwide
                    from v1_df left join
                        (select device_code, app_id, est_installs, est_average_active_users
                         from v1_df where country_code ='WW'
                         ) AS ww
                    on (v1_df.app_id=ww.app_id) and (v1_df.device_code = ww.device_code)
                    """)
        # Fill nulls with 0 and drop the _identifier column before comparing.
        unified_v1 = unified_v1.na.fill(0).drop('_identifier')
        
        unified_v3 = spark.read.format("delta").load(unified_v3_path_parse).drop('_identifier', 'date', 'granularity')

        check_percentage_accuracy(unified_v3, date)
        check_not_empty(unified_v3, date)

        # Count rows present on one side but missing on the other, in both directions,
        # each time projecting onto the other side's column list.
        subtract_count = unified_v1.select(unified_v3.columns).subtract(unified_v3).count()
        subtract_count_reverse = unified_v3.select(unified_v1.columns).subtract(unified_v1).count()
        if subtract_count != 0 or subtract_count_reverse != 0:
            print 'Accuracy Test FAIL!!!! subtract count: {}, date: {}'.format(
                max(subtract_count, subtract_count_reverse), date)
        else:
            print 'Accuracy Test PASS! subtract count: {}, date: {}'.format(
                max(subtract_count, subtract_count_reverse), date)


# Granularities processed by this cell.
granularity_list = ["weekly"]
for granularity in granularity_list:
    # Build the month -> dates grouping for this granularity and run the comparison on it.
    check_usage_unified_v1_v3_accuracy(
        granularity,
        get_path_date_list(granularity)
    )
# Printed once the loop has finished; the checks print their own pass/fail lines.
print('pass')

In [0]:

# Ad-hoc lookup: show est_install_penetration for one app / device / country on 2020-04-13.
df = spark.read.parquet(
    's3://b2c-prod-data-pipeline-unified-usage/unified/app-tech.usage.basic-kpi.v3/fact/granularity=daily/date=2020-04-13/'
)
(
    df
    .filter("app_id=20600001688061 and device_code='android-phone' and country_code='SG'")
    .select('est_install_penetration')
    .show()
)

In [0]:

from pyspark.sql import Row
import random
from pyspark.sql.functions import lit, coalesce
from pyspark.sql.types import DoubleType
import datetime
from dateutil.relativedelta import relativedelta


def last_day_of_month(check_month):
    """Return the last calendar day of the month that contains ``check_month``."""
    # Day 28 exists in every month, and 28 + 4 days always falls in the next month.
    in_following_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    # Stepping back by that date's day-of-month lands on the previous month's last day.
    overshoot = datetime.timedelta(days=in_following_month.day)
    return in_following_month - overshoot


def get_monthly_date_list():
    """Build one single-field Row per month end between the hard-coded bounds.

    Returns:
        A list of ``Row`` objects, each wrapping the 'YYYY-MM-DD' string of a
        month's last day, starting with January 2020 and continuing while the
        stepped date is on or before 2020-04-30.
    """
    final_month_end = datetime.date(2020, 4, 30)
    one_month = relativedelta(months=1)

    month_rows = []
    cursor = datetime.date(2020, 1, 31)
    while cursor <= final_month_end:
        # The stepped date is not guaranteed to be a month end, so snap to it first.
        cursor = last_day_of_month(cursor)
        month_rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor = cursor + one_month
    return month_rows


def get_weekly_date_list():
    """Build one single-field Row per week between the hard-coded bounds.

    Returns:
        A list of ``Row`` objects, each wrapping a 'YYYY-MM-DD' string,
        starting at 2020-01-04 and stepping one week at a time while the
        date is on or before 2020-05-23.
    """
    last_day = datetime.date(2020, 5, 23)
    one_week = relativedelta(weeks=1)

    week_rows = []
    current = datetime.date(2020, 1, 4)
    while current <= last_day:
        week_rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + one_week
    return week_rows


def get_daily_date_list():
    """Build one single-field Row per day between the hard-coded bounds.

    Returns:
        A list of ``Row`` objects, each wrapping a 'YYYY-MM-DD' string,
        starting at 2020-01-01 and stepping one day at a time while the
        date is on or before 2020-05-23.
    """
    last_day = datetime.date(2020, 5, 23)
    one_day = relativedelta(days=1)

    day_rows = []
    current = datetime.date(2020, 1, 1)
    while current <= last_day:
        day_rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + one_day
    return day_rows


def get_path_date_list(gran):
    """Group the dates generated for ``gran`` by calendar month.

    Args:
        gran: One of 'daily', 'weekly' or 'monthly'.

    Returns:
        A list of ``(month, dates)`` tuples sorted chronologically, where
        ``month`` is a 'YYYY-MM' string and ``dates`` is the list of
        'YYYY-MM-DD' strings of that month in generation order.

    Raises:
        ValueError: If ``gran`` is not one of the supported granularities.
    """
    if gran == 'daily':
        collect_date = get_daily_date_list()
    elif gran == 'weekly':
        collect_date = get_weekly_date_list()
    elif gran == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        # Fail fast with a clear message rather than an UnboundLocalError below.
        raise ValueError("unsupported granularity: {!r}".format(gran))

    date_list = {}
    for x in collect_date:
        # x[0] is the 'YYYY-MM-DD' string; its first seven characters are 'YYYY-MM'.
        date_list.setdefault(x[0][:7], []).append(x[0])

    # Order the months chronologically by parsing 'YYYY-MM' + '-1' as a date.
    return sorted(date_list.items(),
                  key=lambda d: datetime.datetime.strptime(d[0] + '-1', '%Y-%m-%d'))


def check_not_empty(df, date):
    """Print whether any row of ``df`` has a null ``est_average_active_users``.

    Args:
        df: DataFrame exposing an ``est_average_active_users`` column.
        date: Date label included in the printed message.

    Nothing is returned or raised; the outcome is only printed.
    """
    null_rows = df.filter("est_average_active_users is null").count()
    if null_rows == 0:
        message = "AU is Not Empty Test Pass! date: {}".format(date)
    else:
        message = "AU is Not Empty Test Fail!!! empty_count: {}, date: {}".format(null_rows, date)
    print(message)


def check_percentage_accuracy(df, date):
    """Print a failure line when any of the listed ratio columns of ``df`` exceeds 1.

    Args:
        df: DataFrame exposing the columns named in the filter expression.
        date: Date label included in the printed message.

    Only failures are reported; nothing is printed when no row matches,
    and nothing is returned or raised.
    """
    over_one_condition = (
        "est_install_penetration>1 or est_open_rate>1 or est_percent_of_wifi_total>1 "
        "or est_percentage_active_days>1 or est_share_of_category_bytes>1 or est_share_of_category_session>1 "
        "or est_share_of_category_time>1 or est_installs_country_share>1 or est_average_active_users_country_share>1 "
        "or est_usage_penetration>1"
    )
    offending_rows = df.filter(over_one_condition).count()
    if offending_rows:
        print("Percentage<1 Test Fail!!! illegal_percentage_count: {}, date: {}".format(offending_rows, date))


def check_usage_unified_v1_v3_accuracy(_granularity, date_list):
    """
    Run the percentage and not-empty checks on the "v1" parquet data for every date.

    Unlike a sampled comparison, this walks every date of every month in
    date_list, loads that date's parquet data and passes it to
    check_percentage_accuracy and check_not_empty. No v3 data is read here.
    Results are only printed by those checks; nothing is returned.

        _granularity:
                value substituted into the granularity= part of the path
        date_list:
                [(month,[day1,day2,day3])]
        sample:
            [('2015-12', ['2015-12-27', '2015-12-28', '2015-12-29', '2015-12-30', '2015-12-31'])]
    """
    # The path template does not change between iterations, so build it once.
    unified_v1_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/app-tech.usage.basic-kpi.v3/fact/' \
                      'granularity={v1_granularity}/date={v1_date}/'
    for month in date_list:
        for date in month[1]:
            unified_v1_path_parse = unified_v1_path.format(v1_granularity=_granularity, v1_date=date)

            unified_v1 = spark.read.parquet(unified_v1_path_parse)

            check_percentage_accuracy(unified_v1, date)
            check_not_empty(unified_v1, date)



# Granularities processed by this cell.
granularity_list = ["daily"]
for granularity in granularity_list:
    # Build the month -> dates grouping for this granularity and run the per-date checks on it.
    check_usage_unified_v1_v3_accuracy(
        granularity,
        get_path_date_list(granularity)
    )
# Printed once the loop has finished; the checks print their own result lines.
print('pass')


In [0]:

from pyspark.sql import Row
import random
from pyspark.sql.functions import lit, coalesce
from pyspark.sql.types import DoubleType
import datetime
from dateutil.relativedelta import relativedelta

test_result = []
device_code_agg_mapping = {'android-phone': 'android-all', 'android-tablet': 'android-all',
                           'ios-phone': 'ios-all', 'ios-tablet': 'ios-all'}


def last_day_of_month(check_month):
    """Return the last calendar day of the month that contains ``check_month``."""
    # Day 28 exists in every month, and 28 + 4 days always falls in the next month.
    in_following_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    # Stepping back by that date's day-of-month lands on the previous month's last day.
    overshoot = datetime.timedelta(days=in_following_month.day)
    return in_following_month - overshoot


def get_monthly_date_list():
    """Build one single-field Row per month end between the hard-coded bounds.

    Returns:
        A list of ``Row`` objects, each wrapping the 'YYYY-MM-DD' string of a
        month's last day, starting with January 2020 and continuing while the
        stepped date is on or before 2020-04-30.
    """
    final_month_end = datetime.date(2020, 4, 30)
    one_month = relativedelta(months=1)

    month_rows = []
    cursor = datetime.date(2020, 1, 31)
    while cursor <= final_month_end:
        # The stepped date is not guaranteed to be a month end, so snap to it first.
        cursor = last_day_of_month(cursor)
        month_rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor = cursor + one_month
    return month_rows


def get_weekly_date_list():
    """Build one single-field Row per week between the hard-coded bounds.

    Returns:
        A list of ``Row`` objects, each wrapping a 'YYYY-MM-DD' string,
        starting at 2020-01-04 and stepping one week at a time while the
        date is on or before 2020-05-23.
    """
    last_day = datetime.date(2020, 5, 23)
    one_week = relativedelta(weeks=1)

    week_rows = []
    current = datetime.date(2020, 1, 4)
    while current <= last_day:
        week_rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + one_week
    return week_rows


def get_daily_date_list():
    """Build one single-field Row per day between the hard-coded bounds.

    Returns:
        A list of ``Row`` objects, each wrapping a 'YYYY-MM-DD' string,
        starting at 2020-01-01 and stepping one day at a time while the
        date is on or before 2020-05-23.
    """
    last_day = datetime.date(2020, 5, 23)
    one_day = relativedelta(days=1)

    day_rows = []
    current = datetime.date(2020, 1, 1)
    while current <= last_day:
        day_rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + one_day
    return day_rows


def get_path_date_list(gran):
    """Group the dates generated for ``gran`` by calendar month.

    Args:
        gran: One of 'daily', 'weekly' or 'monthly'.

    Returns:
        A list of ``(month, dates)`` tuples sorted chronologically, where
        ``month`` is a 'YYYY-MM' string and ``dates`` is the list of
        'YYYY-MM-DD' strings of that month in generation order.

    Raises:
        ValueError: If ``gran`` is not one of the supported granularities.
    """
    if gran == 'daily':
        collect_date = get_daily_date_list()
    elif gran == 'weekly':
        collect_date = get_weekly_date_list()
    elif gran == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        # Fail fast with a clear message rather than an UnboundLocalError below.
        raise ValueError("unsupported granularity: {!r}".format(gran))

    date_list = {}
    for x in collect_date:
        # x[0] is the 'YYYY-MM-DD' string; its first seven characters are 'YYYY-MM'.
        date_list.setdefault(x[0][:7], []).append(x[0])

    # Order the months chronologically by parsing 'YYYY-MM' + '-1' as a date.
    return sorted(date_list.items(),
                  key=lambda d: datetime.datetime.strptime(d[0] + '-1', '%Y-%m-%d'))


def check_not_empty(df, date):
    """Print whether any row of ``df`` has a null ``est_average_active_users``.

    Args:
        df: DataFrame exposing an ``est_average_active_users`` column.
        date: Date label included in the printed message.

    Nothing is returned or raised; the outcome is only printed.
    """
    null_rows = df.filter("est_average_active_users is null").count()
    if null_rows == 0:
        message = "AU is Not Empty Test Pass! date: {}".format(date)
    else:
        message = "AU is Not Empty Test Fail!!! empty_count: {}, date: {}".format(null_rows, date)
    print(message)


def check_percentage_accuracy(df, date):
    """Print whether any of the listed ratio columns of ``df`` exceeds 1.

    Args:
        df: DataFrame exposing the columns named in the filter expression.
        date: Date label included in the printed message.

    Nothing is returned or raised; the outcome is only printed.
    """
    over_one_condition = (
        "est_install_penetration>1 or est_open_rate>1 or est_percent_of_wifi_total>1 "
        "or est_percentage_active_days>1 or est_share_of_category_bytes>1 or est_share_of_category_session>1 "
        "or est_share_of_category_time>1 or est_share_of_installs>1 or est_share_of_users>1 "
        "or est_usage_penetration>1"
    )
    offending_rows = df.filter(over_one_condition).count()
    if offending_rows == 0:
        print("Percentage<1 Test Pass! date: {}".format(date))
    else:
        print("Percentage<1 Test Fail!!! illegal_percentage_count: {}, date: {}".format(offending_rows, date))


def check_usage_unified_v1_v3_accuracy(_granularity, date_list):
    """
    Compare the reshaped "v1" data with the unified v3 data for one random date per month.

    For every entry of date_list a single date is sampled, the "v1" parquet
    data is loaded and reshaped (derived columns added, values rescaled,
    columns renamed, worldwide figures joined in), the v3 delta data is
    loaded, and the rows that differ are counted in both directions.
    Results are only printed; nothing is returned or raised on a mismatch.

        _granularity:
                value substituted into the granularity= part of both paths
        date_list:
                [(month,[day1,day2,day3])]
        sample:
            [('2015-12', ['2015-12-27', '2015-12-28', '2015-12-29', '2015-12-30', '2015-12-31'])]
    """
    for month in date_list:
        # Note: the data called "v1" here is read from the app-tech.usage.basic-kpi.v3 location.
        unified_v1_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/app-tech.usage.basic-kpi.v3/fact/' \
                          'granularity={v1_granularity}/date={v1_date}/'
        unified_v3_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v3/fact/' \
                          'granularity={v3_granularity}/date={v3_date}/'
        # Pick one date of this month at random (randint is inclusive on both ends).
        sample_index = random.randint(0, len(month[1]) - 1)
        date = month[1][sample_index]
        unified_v1_path_parse = unified_v1_path.format(v1_granularity=_granularity, v1_date=date)
        unified_v3_path_parse = unified_v3_path.format(v3_granularity=_granularity, v3_date=date)

        unified_v1 = spark.read.parquet(unified_v1_path_parse)
        # Every unified_v1[...] below is taken from the DataFrame as it was read above,
        # i.e. before any of the rescaling in this statement.
        unified_v1 = (
            unified_v1
            .drop("est_mb_per_second", "est_share_of_device_time", "est_share_of_device_session",
                  "est_share_of_device_mb", "est_panel_size")
            .withColumn('est_total_time',
                        unified_v1['est_average_active_users'] * unified_v1['est_average_time_per_user'] / 60)
            # Falls back to session duration * sessions per user when time per user is null.
            # The * 1000 is presumably seconds -> milliseconds; confirm units.
            .withColumn('est_average_time_per_user', coalesce(unified_v1['est_average_time_per_user'],
                                                              unified_v1['est_average_session_duration'] * 
                                                              unified_v1['est_average_session_per_user']) * 1000)
            .withColumn('est_average_session_duration', unified_v1['est_average_session_duration'] * 1000)
        )
        # From here unified_v1[...] refers to the result of the statement above.
        unified_v1 = (
            unified_v1
            # The * 1024 * 1024 is presumably MB -> bytes; confirm units.
            .withColumn('est_average_bytes_per_session', unified_v1['est_average_bytes_per_session'] * 1024 * 1024)
            .withColumn('est_average_bytes_per_user', unified_v1['est_average_bytes_per_user'] * 1024 * 1024)
            .withColumn('est_share_of_category_bytes', unified_v1['est_share_of_category_bytes'] * 1024 * 1024)
            .withColumn('est_install_base',
                        unified_v1['est_install_penetration'] *
                        unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
            .withColumn('est_population', unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
            .withColumnRenamed('est_average_active_users_country_share', 'est_share_of_users')
            .withColumnRenamed('est_installs_country_share', 'est_share_of_installs')
            .withColumn('est_total_sessions',
                        unified_v1['est_average_active_users'] * unified_v1['est_average_session_per_user'])
        )
        # Attach the 'WW' row's active users and installs to every row that shares
        # its app_id and device_code (left join, so rows without a 'WW' match are kept).
        unified_v1.createOrReplaceTempView("v1_df")
        unified_v1 = spark.sql("""
                    select
                        v1_df.*,
                        ww.est_average_active_users as est_average_active_users_worldwide,
                        ww.est_installs as est_installs_worldwide
                    from v1_df left join
                        (select device_code, app_id, est_installs, est_average_active_users
                         from v1_df where country_code ='WW'
                         ) AS ww
                    on (v1_df.app_id=ww.app_id) and (v1_df.device_code = ww.device_code)
                    """)
        # Fill nulls with 0 and drop the _identifier column before comparing.
        unified_v1 = unified_v1.na.fill(0).drop('_identifier')
        
        unified_v3 = spark.read.format("delta").load(unified_v3_path_parse).drop('_identifier', 'date', 'granularity')

        check_percentage_accuracy(unified_v3, date)
        check_not_empty(unified_v3, date)

        # Count rows present on one side but missing on the other, in both directions,
        # each time projecting onto the other side's column list.
        subtract_count = unified_v1.select(unified_v3.columns).subtract(unified_v3).count()
        subtract_count_reverse = unified_v3.select(unified_v1.columns).subtract(unified_v1).count()
        if subtract_count != 0 or subtract_count_reverse != 0:
            print 'Accuracy Test FAIL!!!! subtract count: {}, date: {}'.format(
                max(subtract_count, subtract_count_reverse), date)
        else:
            print 'Accuracy Test PASS! subtract count: {}, date: {}'.format(
                max(subtract_count, subtract_count_reverse), date)


# Granularities processed by this cell.
granularity_list = ["monthly"]
for granularity in granularity_list:
    # Build the month -> dates grouping for this granularity and run the comparison on it.
    check_usage_unified_v1_v3_accuracy(
        granularity,
        get_path_date_list(granularity)
    )
# Printed once the loop has finished; the checks print their own pass/fail lines.
print('pass')

In [0]:
%%sh
aws s3 ls s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v3/fact/granularity=daily/

In [0]:

unified_v3 = spark.read.format("delta").load('s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v3/fact/granularity=daily/date=2020-05-01/')
row_list = unified_v3.collect()
columns = unified_v3.columns
for col in columns:
    print col, '\t', row_list[0][col]
    

In [0]:

from pyspark.sql import Row
import random
from pyspark.sql.functions import lit, coalesce
from pyspark.sql.types import DoubleType
import datetime
from dateutil.relativedelta import relativedelta

test_result = []
device_code_agg_mapping = {'android-phone': 'android-all', 'android-tablet': 'android-all',
                           'ios-phone': 'ios-all', 'ios-tablet': 'ios-all'}


def last_day_of_month(check_month):
    """Return the last calendar day of the month that contains ``check_month``."""
    # Day 28 exists in every month, and 28 + 4 days always falls in the next month.
    in_following_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    # Stepping back by that date's day-of-month lands on the previous month's last day.
    overshoot = datetime.timedelta(days=in_following_month.day)
    return in_following_month - overshoot


def get_monthly_date_list():
    """Build one single-field Row per month end between the hard-coded bounds.

    Returns:
        A list of ``Row`` objects, each wrapping the 'YYYY-MM-DD' string of a
        month's last day, starting with January 2020 and continuing while the
        stepped date is on or before 2020-04-30.
    """
    final_month_end = datetime.date(2020, 4, 30)
    one_month = relativedelta(months=1)

    month_rows = []
    cursor = datetime.date(2020, 1, 31)
    while cursor <= final_month_end:
        # The stepped date is not guaranteed to be a month end, so snap to it first.
        cursor = last_day_of_month(cursor)
        month_rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor = cursor + one_month
    return month_rows


def get_weekly_date_list():
    """Build one single-field Row per week between the hard-coded bounds.

    Returns:
        A list of ``Row`` objects, each wrapping a 'YYYY-MM-DD' string,
        starting at 2020-01-04 and stepping one week at a time while the
        date is on or before 2020-05-23.
    """
    last_day = datetime.date(2020, 5, 23)
    one_week = relativedelta(weeks=1)

    week_rows = []
    current = datetime.date(2020, 1, 4)
    while current <= last_day:
        week_rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + one_week
    return week_rows


def get_daily_date_list():
    """Build one single-field Row per day between the hard-coded bounds.

    Returns:
        A list of ``Row`` objects, each wrapping a 'YYYY-MM-DD' string,
        starting at 2018-04-01 and stepping one day at a time while the
        date is on or before 2019-12-31.
    """
    last_day = datetime.date(2019, 12, 31)
    one_day = relativedelta(days=1)

    day_rows = []
    current = datetime.date(2018, 4, 1)
    while current <= last_day:
        day_rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + one_day
    return day_rows


def get_path_date_list(gran):
    """Group the dates generated for ``gran`` by calendar month.

    Args:
        gran: One of 'daily', 'weekly' or 'monthly'.

    Returns:
        A list of ``(month, dates)`` tuples sorted chronologically, where
        ``month`` is a 'YYYY-MM' string and ``dates`` is the list of
        'YYYY-MM-DD' strings of that month in generation order.

    Raises:
        ValueError: If ``gran`` is not one of the supported granularities.
    """
    if gran == 'daily':
        collect_date = get_daily_date_list()
    elif gran == 'weekly':
        collect_date = get_weekly_date_list()
    elif gran == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        # Fail fast with a clear message rather than an UnboundLocalError below.
        raise ValueError("unsupported granularity: {!r}".format(gran))

    date_list = {}
    for x in collect_date:
        # x[0] is the 'YYYY-MM-DD' string; its first seven characters are 'YYYY-MM'.
        date_list.setdefault(x[0][:7], []).append(x[0])

    # Order the months chronologically by parsing 'YYYY-MM' + '-1' as a date.
    return sorted(date_list.items(),
                  key=lambda d: datetime.datetime.strptime(d[0] + '-1', '%Y-%m-%d'))


def check_not_empty(df, date):
    """Print whether any row of ``df`` has a null ``est_average_active_users``.

    Args:
        df: DataFrame exposing an ``est_average_active_users`` column.
        date: Date label included in the printed message.

    Nothing is returned or raised; the outcome is only printed.
    """
    null_rows = df.filter("est_average_active_users is null").count()
    if null_rows == 0:
        message = "AU is Not Empty Test Pass! date: {}".format(date)
    else:
        message = "AU is Not Empty Test Fail!!! empty_count: {}, date: {}".format(null_rows, date)
    print(message)


def check_percentage_accuracy(df, date):
    """Print whether any of the listed ratio columns of ``df`` exceeds 1.

    Args:
        df: DataFrame exposing the columns named in the filter expression.
        date: Date label included in the printed message.

    Nothing is returned or raised; the outcome is only printed.
    """
    over_one_condition = (
        "est_install_penetration>1 or est_open_rate>1 or est_percent_of_wifi_total>1 "
        "or est_percentage_active_days>1 or est_share_of_category_bytes>1 or est_share_of_category_session>1 "
        "or est_share_of_category_time>1 or est_share_of_installs>1 or est_share_of_users>1 "
        "or est_usage_penetration>1"
    )
    offending_rows = df.filter(over_one_condition).count()
    if offending_rows == 0:
        print("Percentage<1 Test Pass! date: {}".format(date))
    else:
        print("Percentage<1 Test Fail!!! illegal_percentage_count: {}, date: {}".format(offending_rows, date))


def check_usage_unified_v1_v3_accuracy(_granularity, date_list):
    """
    Compare the reshaped v1 data with the unified v3 data for one random date per month.

    For every entry of date_list a single date is sampled, the v1 parquet
    data is loaded and reshaped (derived columns added, values rescaled,
    columns renamed, worldwide figures joined in), the v3 delta data is
    loaded, and the rows that differ are counted in both directions.
    Results are only printed; nothing is returned or raised on a mismatch.

        _granularity:
                value substituted into the granularity= part of both paths
        date_list:
                [(month,[day1,day2,day3])]
        sample:
            [('2015-12', ['2015-12-27', '2015-12-28', '2015-12-29', '2015-12-30', '2015-12-31'])]
    """
    for month in date_list:
        unified_v1_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v1/fact/' \
                          'granularity={v1_granularity}/date={v1_date}/'
        unified_v3_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v3/fact/' \
                          'granularity={v3_granularity}/date={v3_date}/'
        # Pick one date of this month at random (randint is inclusive on both ends).
        sample_index = random.randint(0, len(month[1]) - 1)
        date = month[1][sample_index]
        unified_v1_path_parse = unified_v1_path.format(v1_granularity=_granularity, v1_date=date)
        unified_v3_path_parse = unified_v3_path.format(v3_granularity=_granularity, v3_date=date)

        unified_v1 = spark.read.parquet(unified_v1_path_parse)
        # Every unified_v1[...] below is taken from the DataFrame as it was read above,
        # i.e. before any of the rescaling in this statement.
        unified_v1 = (
            unified_v1
            .drop("est_mb_per_second", "est_share_of_device_time", "est_share_of_device_session",
                  "est_share_of_device_mb", "est_panel_size")
            .withColumn('est_total_time',
                        unified_v1['est_average_active_users'] * unified_v1['est_average_time_per_user'] / 60)
            # Falls back to session duration * sessions per user when time per user is null.
            # The * 1000 is presumably seconds -> milliseconds; confirm units.
            .withColumn('est_average_time_per_user', coalesce(unified_v1['est_average_time_per_user'],
                                                              unified_v1['est_average_session_duration'] *
                                                              unified_v1['est_average_session_per_user']) * 1000)
            .withColumn('est_average_session_duration', unified_v1['est_average_session_duration'] * 1000)
        )
        # From here unified_v1[...] refers to the result of the statement above.
        unified_v1 = (
            unified_v1
            # The * 1024 * 1024 is presumably MB -> bytes; confirm units.
            .withColumn('est_average_bytes_per_session', unified_v1['est_average_bytes_per_session'] * 1024 * 1024)
            .withColumn('est_average_bytes_per_user', unified_v1['est_average_bytes_per_user'] * 1024 * 1024)
            .withColumn('est_share_of_category_bytes', unified_v1['est_share_of_category_bytes'] * 1024 * 1024)
            .withColumn('est_install_base',
                        unified_v1['est_install_penetration'] *
                        unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
            .withColumn('est_population',
                        unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
            .withColumnRenamed('est_average_active_users_country_share', 'est_share_of_users')
            .withColumnRenamed('est_installs_country_share', 'est_share_of_installs')
            .withColumn('est_total_sessions',
                        unified_v1['est_average_active_users'] * unified_v1['est_average_session_per_user'])
        )
        # Attach the 'WW' row's active users and installs to every row that shares
        # its app_id and device_code (left join, so rows without a 'WW' match are kept).
        unified_v1.createOrReplaceTempView("v1_df")
        unified_v1 = spark.sql("""
                    select
                        v1_df.*,
                        ww.est_average_active_users as est_average_active_users_worldwide,
                        ww.est_installs as est_installs_worldwide
                    from v1_df left join
                        (select device_code, app_id, est_installs, est_average_active_users
                         from v1_df where country_code ='WW'
                         ) AS ww
                    on (v1_df.app_id=ww.app_id) and (v1_df.device_code = ww.device_code)
                    """)
        # Fill nulls with 0 and drop the _identifier column before comparing.
        unified_v1 = unified_v1.na.fill(0).drop('_identifier')

        unified_v3 = spark.read.format("delta").load(unified_v3_path_parse).drop('_identifier', 'date', 'granularity')

        check_percentage_accuracy(unified_v3, date)
        check_not_empty(unified_v3, date)

        # Count rows present on one side but missing on the other, in both directions,
        # each time projecting onto the other side's column list.
        subtract_count = unified_v1.select(unified_v3.columns).subtract(unified_v3).count()
        subtract_count_reverse = unified_v3.select(unified_v1.columns).subtract(unified_v1).count()
        if subtract_count != 0 or subtract_count_reverse != 0:
            print 'Accuracy Test FAIL!!!! subtract count: {}, date: {}'.format(
                max(subtract_count, subtract_count_reverse), date)
        else:
            print 'Accuracy Test PASS! subtract count: {}, date: {}'.format(
                max(subtract_count, subtract_count_reverse), date)


# Granularities processed by this cell.
granularity_list = ["daily"]
for granularity in granularity_list:
    # Build the month -> dates grouping for this granularity and run the comparison on it.
    check_usage_unified_v1_v3_accuracy(
        granularity,
        get_path_date_list(granularity)
    )
# Printed once the loop has finished; the checks print their own pass/fail lines.
print('pass')

In [0]:

from pyspark.sql import Row
import random
from pyspark.sql.functions import lit, coalesce
from pyspark.sql.types import DoubleType
import datetime
from dateutil.relativedelta import relativedelta

test_result = []
device_code_agg_mapping = {'android-phone': 'android-all', 'android-tablet': 'android-all',
                           'ios-phone': 'ios-all', 'ios-tablet': 'ios-all'}


def last_day_of_month(check_month):
    """Return the last calendar day of the month that contains ``check_month``."""
    # Day 28 exists in every month, and 28 + 4 days always falls in the next month.
    in_following_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    # Stepping back by that date's day-of-month lands on the previous month's last day.
    overshoot = datetime.timedelta(days=in_following_month.day)
    return in_following_month - overshoot


def get_monthly_date_list():
    """Build one single-field Row per month end between the hard-coded bounds.

    Returns:
        A list of ``Row`` objects, each wrapping the 'YYYY-MM-DD' string of a
        month's last day, starting with January 2020 and continuing while the
        stepped date is on or before 2020-04-30.
    """
    final_month_end = datetime.date(2020, 4, 30)
    one_month = relativedelta(months=1)

    month_rows = []
    cursor = datetime.date(2020, 1, 31)
    while cursor <= final_month_end:
        # The stepped date is not guaranteed to be a month end, so snap to it first.
        cursor = last_day_of_month(cursor)
        month_rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor = cursor + one_month
    return month_rows


def get_weekly_date_list():
    """Build one single-field Row per week between the hard-coded bounds.

    Returns:
        A list of ``Row`` objects, each wrapping a 'YYYY-MM-DD' string,
        starting at 2013-01-12 and stepping one week at a time while the
        date is on or before 2020-05-23.
    """
    last_day = datetime.date(2020, 5, 23)
    one_week = relativedelta(weeks=1)

    week_rows = []
    current = datetime.date(2013, 1, 12)
    while current <= last_day:
        week_rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + one_week
    return week_rows


def get_daily_date_list():
    """Build one single-field Row per day between the hard-coded bounds.

    Returns:
        A list of ``Row`` objects, each wrapping a 'YYYY-MM-DD' string,
        starting at 2018-04-01 and stepping one day at a time while the
        date is on or before 2019-12-31.
    """
    last_day = datetime.date(2019, 12, 31)
    one_day = relativedelta(days=1)

    day_rows = []
    current = datetime.date(2018, 4, 1)
    while current <= last_day:
        day_rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + one_day
    return day_rows


def get_path_date_list(gran):
    """Group the dates generated for ``gran`` by calendar month.

    Args:
        gran: One of 'daily', 'weekly' or 'monthly'.

    Returns:
        A list of ``(month, dates)`` tuples sorted chronologically, where
        ``month`` is a 'YYYY-MM' string and ``dates`` is the list of
        'YYYY-MM-DD' strings of that month in generation order.

    Raises:
        ValueError: If ``gran`` is not one of the supported granularities.
    """
    if gran == 'daily':
        collect_date = get_daily_date_list()
    elif gran == 'weekly':
        collect_date = get_weekly_date_list()
    elif gran == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        # Fail fast with a clear message rather than an UnboundLocalError below.
        raise ValueError("unsupported granularity: {!r}".format(gran))

    date_list = {}
    for x in collect_date:
        # x[0] is the 'YYYY-MM-DD' string; its first seven characters are 'YYYY-MM'.
        date_list.setdefault(x[0][:7], []).append(x[0])

    # Order the months chronologically by parsing 'YYYY-MM' + '-1' as a date.
    return sorted(date_list.items(),
                  key=lambda d: datetime.datetime.strptime(d[0] + '-1', '%Y-%m-%d'))


def check_not_empty(df, date):
    """Print whether any row of ``df`` has a null ``est_average_active_users``.

    Args:
        df: DataFrame exposing an ``est_average_active_users`` column.
        date: Date label included in the printed message.

    Nothing is returned or raised; the outcome is only printed.
    """
    null_rows = df.filter("est_average_active_users is null").count()
    if null_rows == 0:
        message = "AU is Not Empty Test Pass! date: {}".format(date)
    else:
        message = "AU is Not Empty Test Fail!!! empty_count: {}, date: {}".format(null_rows, date)
    print(message)


def check_percentage_accuracy(df, date):
    """Print whether any of the listed ratio columns of ``df`` exceeds 1.

    Args:
        df: DataFrame exposing the columns named in the filter expression.
        date: Date label included in the printed message.

    Nothing is returned or raised; the outcome is only printed.
    """
    over_one_condition = (
        "est_install_penetration>1 or est_open_rate>1 or est_percent_of_wifi_total>1 "
        "or est_percentage_active_days>1 or est_share_of_category_bytes>1 or est_share_of_category_session>1 "
        "or est_share_of_category_time>1 or est_share_of_installs>1 or est_share_of_users>1 "
        "or est_usage_penetration>1"
    )
    offending_rows = df.filter(over_one_condition).count()
    if offending_rows == 0:
        print("Percentage<1 Test Pass! date: {}".format(date))
    else:
        print("Percentage<1 Test Fail!!! illegal_percentage_count: {}, date: {}".format(offending_rows, date))


def check_usage_unified_v1_v3_accuracy(_granularity, date_list):
    """Compare unified usage v1 data with v3 data for one random date per month.

    For every entry of date_list one date is drawn at random. The v1 parquet
    data of that date is reshaped (columns dropped, rescaled, derived,
    renamed, worldwide columns joined in), the v3 delta data is loaded, and
    the two are compared in both directions with subtract(). The v3 data is
    also passed to check_percentage_accuracy and check_not_empty. Every
    outcome is printed; nothing is returned.

    Args:
        _granularity: value placed in the 'granularity=' segment of both
            S3 paths.
        date_list: list of (month, dates) entries, for example
            [('2015-12', ['2015-12-27', '2015-12-28', '2015-12-29'])].
            Only the dates list (index 1) of each entry is read; it must
            not be empty.

    Uses the name `spark`, which is not defined in this function and must
    already exist in the enclosing scope.
    """
    # The path templates are the same for every month, so build them once.
    unified_v1_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v1/fact/' \
                      'granularity={v1_granularity}/date={v1_date}/'
    unified_v3_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v3/fact/' \
                      'granularity={v3_granularity}/date={v3_date}/'
    for month in date_list:
        # Only one randomly chosen date per month is verified.
        date = random.choice(month[1])
        unified_v1_path_parse = unified_v1_path.format(v1_granularity=_granularity, v1_date=date)
        unified_v3_path_parse = unified_v3_path.format(v3_granularity=_granularity, v3_date=date)

        unified_v1 = spark.read.parquet(unified_v1_path_parse)
        # est_total_time is added before est_average_time_per_user is
        # rescaled, so it is computed from the value as stored in v1.
        # NOTE(review): the * 1000 and / 60 factors look like time-unit
        # conversions towards the v3 layout -- confirm the units.
        unified_v1 = (
            unified_v1
            .drop("est_mb_per_second", "est_share_of_device_time", "est_share_of_device_session",
                  "est_share_of_device_mb", "est_panel_size")
            .withColumn('est_total_time',
                        unified_v1['est_average_active_users'] * unified_v1['est_average_time_per_user'] / 60)
            .withColumn('est_average_time_per_user', coalesce(unified_v1['est_average_time_per_user'],
                                                              unified_v1['est_average_session_duration'] *
                                                              unified_v1['est_average_session_per_user']) * 1000)
            .withColumn('est_average_session_duration', unified_v1['est_average_session_duration'] * 1000)
        )
        # NOTE(review): est_share_of_category_bytes is scaled by 1024 * 1024
        # like the two byte columns although its name suggests a ratio --
        # confirm this matches how v3 stores it.
        unified_v1 = (
            unified_v1
            .withColumn('est_average_bytes_per_session', unified_v1['est_average_bytes_per_session'] * 1024 * 1024)
            .withColumn('est_average_bytes_per_user', unified_v1['est_average_bytes_per_user'] * 1024 * 1024)
            .withColumn('est_share_of_category_bytes', unified_v1['est_share_of_category_bytes'] * 1024 * 1024)
            .withColumn('est_install_base',
                        unified_v1['est_install_penetration'] *
                        unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
            .withColumn('est_population',
                        unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
            .withColumnRenamed('est_average_active_users_country_share', 'est_share_of_users')
            .withColumnRenamed('est_installs_country_share', 'est_share_of_installs')
            .withColumn('est_total_sessions',
                        unified_v1['est_average_active_users'] * unified_v1['est_average_session_per_user'])
        )
        # Self-join on (app_id, device_code) against the country_code 'WW'
        # rows to attach the worldwide active-user and install values.
        unified_v1.createOrReplaceTempView("v1_df")
        unified_v1 = spark.sql("""
                    select
                        v1_df.*,
                        ww.est_average_active_users as est_average_active_users_worldwide,
                        ww.est_installs as est_installs_worldwide
                    from v1_df left join
                        (select device_code, app_id, est_installs, est_average_active_users
                         from v1_df where country_code ='WW'
                         ) AS ww
                    on (v1_df.app_id=ww.app_id) and (v1_df.device_code = ww.device_code)
                    """)
        # Nulls are zero-filled on the v1 side only; v3 is compared as loaded.
        unified_v1 = unified_v1.na.fill(0).drop('_identifier')

        unified_v3 = spark.read.format("delta").load(unified_v3_path_parse).drop('_identifier', 'date', 'granularity')

        check_percentage_accuracy(unified_v3, date)
        check_not_empty(unified_v3, date)

        # Rows present on one side but missing on the other, in each direction;
        # each side is first projected onto the other's column list.
        subtract_count = unified_v1.select(unified_v3.columns).subtract(unified_v3).count()
        subtract_count_reverse = unified_v3.select(unified_v1.columns).subtract(unified_v1).count()
        mismatch_count = max(subtract_count, subtract_count_reverse)
        # print is called with one parenthesised argument: the output is the
        # same on Python 2, and the line is valid on Python 3 as well.
        if subtract_count != 0 or subtract_count_reverse != 0:
            print('Accuracy Test FAIL!!!! subtract count: {}, date: {}'.format(mismatch_count, date))
        else:
            print('Accuracy Test PASS! subtract count: {}, date: {}'.format(mismatch_count, date))


# Granularities verified by this cell; get_path_date_list handles 'daily',
# 'weekly' and 'monthly'.
granularity_list = ["weekly"]
for granularity in granularity_list:
    check_usage_unified_v1_v3_accuracy(granularity, get_path_date_list(granularity))
# Completion marker: printed once the loop has finished, independently of the
# individual Pass/Fail lines printed by the checks. The parenthesised form
# prints the same text on Python 2 and is valid on Python 3.
print('pass')

In [0]:

from pyspark.sql import Row
import random
from pyspark.sql.functions import lit, coalesce
from pyspark.sql.types import DoubleType
import datetime
from dateutil.relativedelta import relativedelta

# Starts empty; presumably an accumulator for test outcomes (it is not
# referenced by the rest of this cell).
test_result = list()
# Maps each phone/tablet device code to the "-all" code of its platform.
device_code_agg_mapping = {
    'android-phone': 'android-all',
    'android-tablet': 'android-all',
    'ios-phone': 'ios-all',
    'ios-tablet': 'ios-all',
}


def last_day_of_month(check_month):
    """Return check_month moved to the last day of its calendar month.

    Args:
        check_month: a date-like object supporting replace(day=...) and
            timedelta arithmetic, e.g. datetime.date.

    Returns:
        An object of the same kind whose day is the final day of the month
        that check_month falls in.
    """
    # Day 28 exists in every month, and four days after it always falls
    # inside the following month.
    in_following_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    # Stepping back by that day-of-month lands on the previous month's last day.
    overshoot = datetime.timedelta(days=in_following_month.day)
    return in_following_month - overshoot


def get_monthly_date_list():
    """Build one single-field Row per month-end date of the fixed monthly window.

    Starting at 2013-01-31, the cursor is snapped to the last day of its
    month, recorded, and advanced by one calendar month; this repeats for
    as long as the advanced cursor is not after 2020-04-30.

    Returns:
        list of Row in ascending date order; each Row wraps one
        'YYYY-MM-DD' string.
    """
    window_end = datetime.date(2020, 4, 30)
    cursor = datetime.date(2013, 1, 31)
    month_ends = []
    while not cursor > window_end:
        # The one-month step does not necessarily land on a month end,
        # so snap to it before recording.
        cursor = last_day_of_month(cursor)
        month_ends.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor = cursor + relativedelta(months=1)
    return month_ends


def get_weekly_date_list():
    """Build one single-field Row per weekly date of the fixed weekly window.

    Dates start at 2013-01-12 and advance in seven-day steps; every date
    that is not after 2020-05-23 is included.

    Returns:
        list of Row in ascending date order; each Row wraps one
        'YYYY-MM-DD' string.
    """
    first_day = datetime.date(2013, 1, 12)
    last_day = datetime.date(2020, 5, 23)
    # Number of seven-day steps that still fit in the window, plus the start.
    week_count = (last_day - first_day).days // 7 + 1
    return [
        Row((first_day + datetime.timedelta(weeks=step)).strftime('%Y-%m-%d'))
        for step in range(week_count)
    ]


def get_daily_date_list():
    """Build one single-field Row per calendar day of the fixed daily window.

    The window runs from 2018-04-01 through 2019-12-31, both ends included.

    Returns:
        list of Row in ascending date order; each Row wraps one
        'YYYY-MM-DD' string.
    """
    first_day = datetime.date(2018, 4, 1)
    last_day = datetime.date(2019, 12, 31)
    # +1 so that last_day itself is part of the result.
    day_count = (last_day - first_day).days + 1
    return [
        Row((first_day + datetime.timedelta(days=offset)).strftime('%Y-%m-%d'))
        for offset in range(day_count)
    ]


def get_path_date_list(gran):
    """Group the dates of one granularity by calendar month.

    Args:
        gran: 'daily', 'weekly' or 'monthly'; selects which date generator
            supplies the dates.

    Returns:
        A list of (month, dates) tuples sorted by month ascending. month is
        the 'YYYY-MM' prefix of the 'YYYY-MM-DD' date strings and dates is
        the list of date strings sharing that prefix, in the order the
        generator produced them.

    Raises:
        ValueError: if gran is not one of the supported granularities.
    """
    if gran == 'daily':
        collect_date = get_daily_date_list()
    elif gran == 'weekly':
        collect_date = get_weekly_date_list()
    elif gran == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        # Without this branch collect_date would be unbound below and the
        # caller would only see an UnboundLocalError.
        raise ValueError(
            "unsupported granularity {!r}; expected 'daily', 'weekly' or 'monthly'".format(gran))

    dates_by_month = {}
    for row in collect_date:
        day = row[0]
        dates_by_month.setdefault(day[:7], []).append(day)

    # Sort on the month parsed as a real date (day fixed to the 1st), so the
    # result is chronological regardless of dict iteration order.
    return sorted(dates_by_month.items(),
                  key=lambda item: datetime.datetime.strptime(item[0] + '-01', '%Y-%m-%d'))


def check_not_empty(df, date):
    """Print whether df contains rows with a null est_average_active_users.

    Args:
        df: object exposing filter(condition).count(); the condition is a
            SQL-style string on the est_average_active_users column.
        date: label of the data being checked; only used in the message.

    The outcome is written to stdout; nothing is returned.
    """
    empty_count = df.filter("est_average_active_users is null").count()
    # print is called with one parenthesised argument: the output is the same
    # on Python 2, and the line is no longer a SyntaxError on Python 3.
    if empty_count != 0:
        print("AU is Not Empty Test Fail!!! empty_count: {}, date: {}".format(empty_count, date))
    else:
        print("AU is Not Empty Test Pass! date: {}".format(date))


def check_percentage_accuracy(df, date):
    """Print whether any of the listed ratio columns of df exceeds 1.

    Args:
        df: object exposing filter(condition).count(); the condition is a
            SQL-style string over the columns listed below.
        date: label of the data being checked; only used in the message.

    A row counts as illegal as soon as one listed column is greater than 1;
    a value of exactly 1 is accepted. The outcome is written to stdout and
    nothing is returned.
    """
    ratio_columns = (
        "est_install_penetration",
        "est_open_rate",
        "est_percent_of_wifi_total",
        "est_percentage_active_days",
        "est_share_of_category_bytes",
        "est_share_of_category_session",
        "est_share_of_category_time",
        "est_share_of_installs",
        "est_share_of_users",
        "est_usage_penetration",
    )
    # Produces "col_a>1 or col_b>1 or ..." over all ratio columns.
    condition = " or ".join("{}>1".format(column) for column in ratio_columns)
    illegal_percentage_count = df.filter(condition).count()
    # print is called with one parenthesised argument: the output is the same
    # on Python 2, and the line is no longer a SyntaxError on Python 3.
    if illegal_percentage_count != 0:
        print("Percentage<1 Test Fail!!! illegal_percentage_count: {}, date: {}".format(
            illegal_percentage_count, date))
    else:
        print("Percentage<1 Test Pass! date: {}".format(date))


def check_usage_unified_v1_v3_accuracy(_granularity, date_list):
    """Compare unified usage v1 data with v3 data for one random date per month.

    For every entry of date_list one date is drawn at random. The v1 parquet
    data of that date is reshaped (columns dropped, rescaled, derived,
    renamed, worldwide columns joined in), the v3 delta data is loaded, and
    the two are compared in both directions with subtract(). The v3 data is
    also passed to check_percentage_accuracy and check_not_empty. Every
    outcome is printed; nothing is returned.

    Args:
        _granularity: value placed in the 'granularity=' segment of both
            S3 paths.
        date_list: list of (month, dates) entries, for example
            [('2015-12', ['2015-12-27', '2015-12-28', '2015-12-29'])].
            Only the dates list (index 1) of each entry is read; it must
            not be empty.

    Uses the name `spark`, which is not defined in this function and must
    already exist in the enclosing scope.
    """
    # The path templates are the same for every month, so build them once.
    unified_v1_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v1/fact/' \
                      'granularity={v1_granularity}/date={v1_date}/'
    unified_v3_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v3/fact/' \
                      'granularity={v3_granularity}/date={v3_date}/'
    for month in date_list:
        # Only one randomly chosen date per month is verified.
        date = random.choice(month[1])
        unified_v1_path_parse = unified_v1_path.format(v1_granularity=_granularity, v1_date=date)
        unified_v3_path_parse = unified_v3_path.format(v3_granularity=_granularity, v3_date=date)

        unified_v1 = spark.read.parquet(unified_v1_path_parse)
        # est_total_time is added before est_average_time_per_user is
        # rescaled, so it is computed from the value as stored in v1.
        # NOTE(review): the * 1000 and / 60 factors look like time-unit
        # conversions towards the v3 layout -- confirm the units.
        unified_v1 = (
            unified_v1
            .drop("est_mb_per_second", "est_share_of_device_time", "est_share_of_device_session",
                  "est_share_of_device_mb", "est_panel_size")
            .withColumn('est_total_time',
                        unified_v1['est_average_active_users'] * unified_v1['est_average_time_per_user'] / 60)
            .withColumn('est_average_time_per_user', coalesce(unified_v1['est_average_time_per_user'],
                                                              unified_v1['est_average_session_duration'] *
                                                              unified_v1['est_average_session_per_user']) * 1000)
            .withColumn('est_average_session_duration', unified_v1['est_average_session_duration'] * 1000)
        )
        # NOTE(review): est_share_of_category_bytes is scaled by 1024 * 1024
        # like the two byte columns although its name suggests a ratio --
        # confirm this matches how v3 stores it.
        unified_v1 = (
            unified_v1
            .withColumn('est_average_bytes_per_session', unified_v1['est_average_bytes_per_session'] * 1024 * 1024)
            .withColumn('est_average_bytes_per_user', unified_v1['est_average_bytes_per_user'] * 1024 * 1024)
            .withColumn('est_share_of_category_bytes', unified_v1['est_share_of_category_bytes'] * 1024 * 1024)
            .withColumn('est_install_base',
                        unified_v1['est_install_penetration'] *
                        unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
            .withColumn('est_population',
                        unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
            .withColumnRenamed('est_average_active_users_country_share', 'est_share_of_users')
            .withColumnRenamed('est_installs_country_share', 'est_share_of_installs')
            .withColumn('est_total_sessions',
                        unified_v1['est_average_active_users'] * unified_v1['est_average_session_per_user'])
        )
        # Self-join on (app_id, device_code) against the country_code 'WW'
        # rows to attach the worldwide active-user and install values.
        unified_v1.createOrReplaceTempView("v1_df")
        unified_v1 = spark.sql("""
                    select
                        v1_df.*,
                        ww.est_average_active_users as est_average_active_users_worldwide,
                        ww.est_installs as est_installs_worldwide
                    from v1_df left join
                        (select device_code, app_id, est_installs, est_average_active_users
                         from v1_df where country_code ='WW'
                         ) AS ww
                    on (v1_df.app_id=ww.app_id) and (v1_df.device_code = ww.device_code)
                    """)
        # Nulls are zero-filled on the v1 side only; v3 is compared as loaded.
        unified_v1 = unified_v1.na.fill(0).drop('_identifier')

        unified_v3 = spark.read.format("delta").load(unified_v3_path_parse).drop('_identifier', 'date', 'granularity')

        check_percentage_accuracy(unified_v3, date)
        check_not_empty(unified_v3, date)

        # Rows present on one side but missing on the other, in each direction;
        # each side is first projected onto the other's column list.
        subtract_count = unified_v1.select(unified_v3.columns).subtract(unified_v3).count()
        subtract_count_reverse = unified_v3.select(unified_v1.columns).subtract(unified_v1).count()
        mismatch_count = max(subtract_count, subtract_count_reverse)
        # print is called with one parenthesised argument: the output is the
        # same on Python 2, and the line is valid on Python 3 as well.
        if subtract_count != 0 or subtract_count_reverse != 0:
            print('Accuracy Test FAIL!!!! subtract count: {}, date: {}'.format(mismatch_count, date))
        else:
            print('Accuracy Test PASS! subtract count: {}, date: {}'.format(mismatch_count, date))


# Granularities verified by this cell; get_path_date_list handles 'daily',
# 'weekly' and 'monthly'.
granularity_list = ["monthly"]
for granularity in granularity_list:
    check_usage_unified_v1_v3_accuracy(granularity, get_path_date_list(granularity))
# Completion marker: printed once the loop has finished, independently of the
# individual Pass/Fail lines printed by the checks. The parenthesised form
# prints the same text on Python 2 and is valid on Python 3.
print('pass')